In [1]:
# One-time environment setup. The %pip lines are intentionally left commented
# out; uncomment them to install the pinned dependencies into the kernel's
# environment.
# %pip install pip==21.0 
# %pip install setuptools==65.5.0
# %pip install torch torchvision torchaudio gym==0.19.0 numpy==1.23.5 matplotlib==3.5.3 tqdm
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# The string below is not executed as shell commands: it only records the
# terminal steps for installing f1tenth_gym, and the notebook echoes it back as
# this cell's output.
"""
In Terminal inside venv
git clone https://github.com/f1tenth/f1tenth_gym.git
cd f1tenth_gym
pip install -e .
"""


'\nIn Terminal inside venv\ngit clone https://github.com/f1tenth/f1tenth_gym.git\ncd f1tenth_gym\npip install -e .\n'

## Imports and Setup

In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import yaml
from argparse import Namespace
from collections import deque
import random

# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports a usable GPU, otherwise the CPU. Later cells read this module-level
# `device` when they create tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [3]:
# Seed every random number generator used in this notebook (PyTorch, NumPy and
# Python's `random`) so that repeated runs start from the same state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Hyperparameter constants.
# NOTE(review): the functions defined later in this notebook appear to use
# their own literal values (e.g. hidden_dim=256, lr=3e-4, minibatch size 64)
# instead of these names -- confirm whether the constants are meant to be
# wired in, and which of the two sets of values is authoritative.
HIDDEN_DIM = 512  # Width of the hidden layers
LEARNING_RATE = 3e-4  # Optimizer step size
GAMMA = 0.99  # Reward discount factor
LAMBDA = 0.95  # GAE parameter
CLIP_EPSILON = 0.2  # PPO clipping parameter
VF_COEFF = 0.5  # Value function loss coefficient
ENT_COEFF = 0.01  # Entropy coefficient
PPO_EPOCHS = 10  # PPO epochs per rollout
GRAD_CLIP = 0.5  # Gradient clipping threshold
BATCH_SIZE = 64  # Mini-batch size
LR_DECAY = 0.9999  # Learning rate decay factor

## Actor and Critic Networks

In [4]:
class ActorNet(nn.Module):
    """Gaussian policy network with a bounded mean and a state-dependent std.

    Three ReLU hidden layers feed two linear heads: one for the action mean
    and one for the log standard deviation.

    Args:
        obs_dim: Size of the flattened observation vector.
        act_dim: Number of action dimensions. The action bounds below have two
            entries (steering, acceleration), so the mean head output must
            broadcast against a length-2 vector.
        hidden_dim: Width of each hidden layer.

    Attributes:
        action_bounds: Dict with keys 'low' and 'high' holding the per-dimension
            action limits as tensors on the same device/dtype as the module.
    """

    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)

        # Separate heads for mean and log_std
        self.mu_head = nn.Linear(hidden_dim, act_dim)
        self.log_std_head = nn.Linear(hidden_dim, act_dim)

        # Orthogonal initialization: sqrt(2) gain for the ReLU trunk and a
        # small gain for both heads so their initial outputs stay close to 0.
        for layer in (self.fc1, self.fc2, self.fc3):
            nn.init.orthogonal_(layer.weight, gain=np.sqrt(2))
        nn.init.orthogonal_(self.mu_head.weight, gain=0.01)
        nn.init.orthogonal_(self.log_std_head.weight, gain=0.01)

        # Action limits for F1TENTH (steering, acceleration).
        # Registered as buffers so they follow the module through .to(),
        # .cuda(), .double(), etc. instead of being pinned to a global device.
        # persistent=False keeps them out of state_dict(), so checkpoints
        # saved before this change still load with strict key matching.
        self.register_buffer('action_low', torch.tensor([-0.4, 0.0]), persistent=False)
        self.register_buffer('action_high', torch.tensor([0.4, 10.0]), persistent=False)

    @property
    def action_bounds(self):
        """Action limits as ``{'low': tensor, 'high': tensor}``."""
        return {'low': self.action_low, 'high': self.action_high}

    def forward(self, x):
        """Return ``(mu, std)`` of the action distribution for observation(s) ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        # Squash the raw mean into [-1, 1], then rescale it to [low, high].
        mu = torch.tanh(self.mu_head(x))
        mu = (mu + 1) / 2 * (self.action_high - self.action_low) + self.action_low

        # State-dependent log_std, clamped so exp() cannot under/overflow.
        log_std = torch.clamp(self.log_std_head(x), -20, 2)
        std = torch.exp(log_std)

        return mu, std

# Improved Critic Network
class CriticNet(nn.Module):
    """State-value network: three ReLU hidden layers and a single-output head.

    Args:
        obs_dim: Size of the flattened observation vector.
        hidden_dim: Width of each hidden layer.
    """

    def __init__(self, obs_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, 1)

        # Orthogonal weights: sqrt(2) gain on the hidden stack, unit gain on
        # the value head. Biases keep nn.Linear's default initialization.
        hidden_gain = np.sqrt(2)
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            nn.init.orthogonal_(hidden_layer.weight, gain=hidden_gain)
        nn.init.orthogonal_(self.out.weight, gain=1.0)

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        hidden = x
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(hidden_layer(hidden))
        return self.out(hidden)

## Observations

In [5]:
class RunningMeanStd:
    """Running per-feature mean and variance, merged in one batch at a time.

    Args:
        shape: Shape of a single sample (the statistics have this shape).
        epsilon: Initial pseudo-count; keeps the first merge well defined.
    """

    def __init__(self, shape=(), epsilon=1e-4):
        # Start from zero mean / unit variance with a tiny prior weight.
        self.mean = np.zeros(shape, dtype=np.float32)
        self.var = np.ones(shape, dtype=np.float32)
        self.count = epsilon

    def update(self, x):
        """Fold a batch ``x`` (samples along axis 0) into the running statistics."""
        n_new = x.shape[0]
        mean_new = np.mean(x, axis=0)
        var_new = np.var(x, axis=0)

        n_old = self.count
        n_all = n_old + n_new
        shift = mean_new - self.mean

        # Combine the two sums of squared deviations, plus the correction term
        # for the gap between the old mean and the batch mean.
        sum_sq = self.var * n_old + var_new * n_new + np.square(shift) * n_old * n_new / n_all

        self.mean = self.mean + shift * n_new / n_all
        self.var = sum_sq / n_all
        self.count = n_all

# Improved observation processor with lap tracking
class ObservationProcessor:
    """Flatten and normalize simulator observations while tracking waypoints and laps.

    Each call to `process_obs` builds a vector of `num_beams` LiDAR ranges plus
    x, y, heading and the two linear velocity components of agent 0, normalizes
    it with running statistics, and updates the waypoint/lap bookkeeping that
    the reward function reads back (`waypoint_reached`,
    `closest_waypoint_dist`, `lap_progress`, `lap_completed`, ...).

    NOTE(review): `lap_count` is only incremented in `_update_lap_progress`,
    which `process_obs` calls only after the waypoints are finalized, while
    the notebook's reward function finalizes waypoints only once `lap_count`
    reaches `min_laps_for_waypoints`. As written, finalization looks
    unreachable unless `finalize_waypoints()` is called from elsewhere --
    confirm the intended flow.

    Args:
        num_beams: Number of LiDAR ranges kept after downsampling.
    """

    def __init__(self, num_beams=270):
        self.num_beams = num_beams
        # Normalizer over the flat observation: beams + (x, y, theta, v_x, v_y).
        self.obs_rms = RunningMeanStd(shape=(num_beams + 5,))
        self.is_training = True

        # Track waypoints for lap detection
        self.waypoints = []
        self.last_waypoint_idx = 0
        self.lap_progress = 0.0
        self.lap_completed = False
        self.previous_progress = 0.0

        # Waypoint collection status
        self.waypoints_finalized = False
        self.min_laps_for_waypoints = 2
        self.waypoint_spacing = 25.0  # Minimum distance between stored waypoints

        # Lap time tracking
        self.current_lap_steps = 0
        self.best_lap_steps = float('inf')
        self.lap_count = 0
        self.current_lap_time = 0.0
        self.best_lap_time = float('inf')

        # Start/finish line detection
        self.start_line_pos = None
        self.lap_almost_complete = False  # True while the car is near the first waypoint

        # Waypoint reward tracking
        self.last_reached_waypoint = -1  # Index of the last waypoint reached
        self.waypoint_reached = False    # True if a new waypoint was reached in this step
        self.closest_waypoint_dist = float('inf')  # Distance to closest waypoint

    @staticmethod
    def _distance(a, b):
        """Euclidean distance between two (x, y) points."""
        return np.hypot(a[0] - b[0], a[1] - b[1])

    def process_obs(self, obs, update_stats=True):
        """Process and normalize the raw observation.

        Args:
            obs: Observation dict; agent 0's entries of 'scans', 'poses_x',
                'poses_y', 'poses_theta', 'linear_vels_x' and 'linear_vels_y'
                are read.
            update_stats: When True (and `is_training` is set) the running
                normalization statistics are updated with this observation.

        Returns:
            1-D float32 tensor of length ``num_beams + 5`` on the global `device`.
        """
        # Downsample the scan to `num_beams` readings. The stride is derived
        # from the scan length so it stays consistent with the normalizer's
        # shape; for a 1080-beam scan and the default 270 beams this is the
        # same every-4th-beam selection as before.
        raw_scan = np.asarray(obs['scans'][0])
        stride = max(1, raw_scan.shape[0] // self.num_beams)
        scan = raw_scan[::stride][:self.num_beams]

        # Clip ranges to [0, 30]; np.clip leaves NaN untouched, so any
        # remaining non-finite entry is replaced with the maximum range.
        scan = np.clip(scan, 0.0, 30.0)
        scan[~np.isfinite(scan)] = 30.0

        # Extract other values
        x = np.array([obs['poses_x'][0]])
        y = np.array([obs['poses_y'][0]])
        theta = np.array([obs['poses_theta'][0]])
        v_x = np.array([obs['linear_vels_x'][0]])
        v_y = np.array([obs['linear_vels_y'][0]])

        # Record position for waypoint collection and lap tracking
        pos = (x[0], y[0])

        # The very first position ever seen becomes the start line.
        if self.start_line_pos is None:
            self.start_line_pos = pos

        # Collect waypoints if not finalized
        if not self.waypoints_finalized:
            self._collect_waypoint(pos)

        # Reset waypoint reached flag
        self.waypoint_reached = False

        if len(self.waypoints) > 0:
            closest_idx, closest_dist = self._find_closest_waypoint(pos)

            # Store closest waypoint distance for reward function
            self.closest_waypoint_dist = closest_dist

            # A waypoint counts as newly reached when the car is within the
            # threshold and it is not the waypoint reached last time.
            WAYPOINT_REACHED_THRESHOLD = 0.5
            if closest_dist < WAYPOINT_REACHED_THRESHOLD and closest_idx != self.last_reached_waypoint:
                self.waypoint_reached = True
                self.last_reached_waypoint = closest_idx

        # Update lap progress and check for completion
        if self.waypoints_finalized and len(self.waypoints) > 10:
            self._update_lap_progress(pos)
        else:
            # Rough fallback while waypoints aren't available: progress grows
            # with straight-line distance from the start and is capped at 0.5.
            dist_from_start = self._distance(pos, self.start_line_pos)
            self.lap_progress = min(0.5, dist_from_start / 20.0)

        # Concatenate all observation components
        flat_obs = np.concatenate([scan, x, y, theta, v_x, v_y])

        # Update normalization stats during training
        if update_stats and self.is_training:
            self.obs_rms.update(flat_obs.reshape(1, -1))

        # Normalize the observation
        normalized_obs = (flat_obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + 1e-8)

        return torch.tensor(normalized_obs, dtype=torch.float32).to(device)

    def _find_closest_waypoint(self, pos):
        """Return ``(index, distance)`` of the waypoint nearest to ``pos``.

        Returns ``(-1, inf)`` when no waypoints have been collected. Ties are
        resolved in favor of the earliest waypoint.
        """
        closest_idx = -1
        closest_dist = float('inf')

        for i, waypoint in enumerate(self.waypoints):
            dist = self._distance(pos, waypoint)
            if dist < closest_dist:
                closest_dist = dist
                closest_idx = i

        return closest_idx, closest_dist

    def _collect_waypoint(self, pos):
        """Append ``pos`` as a waypoint once it is far enough from the last one."""
        if len(self.waypoints) == 0:
            self.waypoints.append(pos)
            return

        # Add waypoint if it's sufficiently distant from the last one
        if self._distance(pos, self.waypoints[-1]) > self.waypoint_spacing:
            self.waypoints.append(pos)

        # Once enough waypoints exist, track whether the car is back near the
        # first one. The flag is set within 2.0 of it and cleared again once
        # the car is more than 5.0 away.
        if len(self.waypoints) > 20:
            dist_to_first = self._distance(pos, self.waypoints[0])

            if dist_to_first < 2.0 and not self.lap_almost_complete:
                self.lap_almost_complete = True
            elif dist_to_first > 5.0 and self.lap_almost_complete:
                self.lap_almost_complete = False

    def _update_lap_progress(self, pos):
        """Update lap progress using collected waypoints and detect lap wrap-around."""
        closest_idx, _ = self._find_closest_waypoint(pos)

        # Progress is the closest waypoint's index as a fraction of the total.
        new_progress = closest_idx / len(self.waypoints)

        # A lap is complete when progress jumps from the end of the waypoint
        # list back to its beginning.
        if self.previous_progress > 0.85 and new_progress < 0.15:
            self.lap_completed = True
            self.lap_count += 1

            # Record a new best lap time; laps of 30.0 or less are ignored.
            if self.current_lap_time < self.best_lap_time and self.current_lap_time > 30.0:
                self.best_lap_time = self.current_lap_time
        else:
            self.lap_completed = False

        # Update progress tracking
        self.lap_progress = new_progress
        self.previous_progress = new_progress
        self.current_lap_steps += 1

    def finalize_waypoints(self):
        """Freeze the waypoint list, rotated so it begins nearest the start line.

        Returns:
            True if the waypoints were finalized by this call; False if they
            were already finalized or 20 or fewer waypoints were collected.
        """
        if not self.waypoints_finalized and len(self.waypoints) > 20:
            # Find closest waypoint to the starting position
            start_waypoint_idx = 0
            min_dist = float('inf')

            for i, waypoint in enumerate(self.waypoints):
                dist = self._distance(waypoint, self.start_line_pos)
                if dist < min_dist:
                    min_dist = dist
                    start_waypoint_idx = i

            # Reorganize waypoints to start from the start line
            self.waypoints = self.waypoints[start_waypoint_idx:] + self.waypoints[:start_waypoint_idx]

            # Now mark as finalized
            self.waypoints_finalized = True
            self.lap_progress = 0.0
            self.previous_progress = 0.0

            print(f"\n✅ Waypoints finalized! Collected {len(self.waypoints)} waypoints.")
            return True
        return False


## Sample Action from Policy

In [6]:
def sample_action_and_logprob(actor, obs_flat, deterministic=False, training_progress=0.0):
    """Pick an action from the policy and score it under the sampling distribution.

    Args:
        actor: Policy returning ``(mu, std)`` for ``obs_flat`` and exposing
            ``action_bounds`` with 'low' and 'high' tensors.
        obs_flat: Processed observation, either a single vector or a batch.
        deterministic: When True the distribution mean is used instead of a sample.
        training_progress: Fraction of training completed; below 0.2 a
            decaying offset is added to the mean of action dimension 1
            (throttle) while sampling.

    Returns:
        ``(action, log_prob, entropy, dist)`` where ``action`` is clamped to the
        actor's bounds and ``log_prob`` is the log-density of that returned
        action, summed over action dimensions.
    """
    mu, std = actor(obs_flat)

    # Early-training exploration bias: push the throttle mean up by an amount
    # that shrinks linearly from 0.3 to 0 over the first 20% of training.
    # NOTE(review): this shifted distribution is what log_prob is measured
    # under, whereas a later re-evaluation of the policy would not include the
    # shift -- confirm that this is intended.
    if not deterministic and training_progress < 0.2:
        exploration_boost = torch.zeros_like(mu)
        # Index the last (action) dimension so batched inputs are handled too.
        exploration_boost[..., 1] = 0.3 * (1.0 - training_progress * 5)
        mu = mu + exploration_boost

    dist = Normal(mu, std)
    raw_action = mu if deterministic else dist.sample()

    # Ensure action is within valid bounds
    action = torch.clamp(raw_action,
                         min=actor.action_bounds['low'],
                         max=actor.action_bounds['high'])

    # Score the action that is actually returned (and therefore stored and
    # re-scored during the policy update), not the pre-clamp sample, so the
    # old and new log-probabilities refer to the same action.
    log_prob = dist.log_prob(action).sum(dim=-1)
    entropy = dist.entropy().sum(dim=-1)

    return action, log_prob, entropy, dist

## Reward Function

In [7]:
def compute_reward(obs, prev_obs, action, step_reward, done, processor):
    """Score one environment step from driving state and lap/waypoint bookkeeping.

    Args:
        obs: Observation dict after the step; agent 0's velocities, angular
            velocity, scan and collision flag are read.
        prev_obs: Previous observation (accepted but not used).
        action: Array indexed as ``action[0, 0]`` for the steering command.
        step_reward: Environment step reward (accepted but not used here).
        done: Environment's episode-finished flag.
        processor: Observation processor; its waypoint/lap fields are read,
            and `best_lap_time` / `current_lap_time` may be updated and
            `finalize_waypoints()` may be called.

    Returns:
        ``(reward, reward_components)`` where the second item is a dict with
        the signed contribution of each term plus 'total' and 'lap_time'.
    """
    # State of the ego agent (index 0).
    vel_x = obs['linear_vels_x'][0]
    vel_y = obs['linear_vels_y'][0]
    yaw_rate = obs['ang_vels_z'][0]
    steer_cmd = action[0, 0]
    nearest_range = np.min(obs['scans'][0])
    crashed = obs['collisions'][0]

    # Speed term: 1.0 per unit of speed, plus an extra 1.0 per unit for the
    # part of the speed above each tier (2.0 and 4.0).
    speed = np.hypot(vel_x, vel_y)
    speed_reward = 1.0 * speed
    for speed_tier in (2.0, 4.0):
        if speed > speed_tier:
            speed_reward += 1.0 * (speed - speed_tier)

    # One-off bonus on the step where a new waypoint is reached.
    waypoint_reward = 10.0 if processor.waypoint_reached else 0.0

    # Continuous pull towards the nearest waypoint: inversely proportional to
    # the distance, which is floored at 0.1 to cap the reward.
    WAYPOINT_DISTANCE_SCALE = 5.0
    if len(processor.waypoints) > 0:
        waypoint_distance_reward = WAYPOINT_DISTANCE_SCALE / max(0.1, processor.closest_waypoint_dist)
    else:
        waypoint_distance_reward = 0.0

    # Wall proximity: grows linearly from 0 to 5.0 as the closest scan return
    # drops from SAFE_DISTANCE to 0.
    SAFE_DISTANCE = 0.3
    if nearest_range < SAFE_DISTANCE:
        wall_penalty = 5.0 * (SAFE_DISTANCE - nearest_range) / SAFE_DISTANCE
    else:
        wall_penalty = 0.0

    collision_penalty = 1000.0 if crashed == 1 else 0.0
    steering_penalty = 0.5 * abs(steer_cmd)   # discourage large steering commands
    spin_penalty = 0.5 * abs(yaw_rate)        # discourage spinning
    step_penalty = 0.05                       # constant per-step cost

    lap_reward = 0.0

    # Episode ended without a collision: treated as finishing the run.
    if done and crashed == 0:
        lap_reward = 250.0

        # Freeze the waypoint list once enough laps have been counted.
        if processor.lap_count >= processor.min_laps_for_waypoints and not processor.waypoints_finalized:
            processor.finalize_waypoints()

    # Lap detected by the processor (only meaningful with finalized waypoints).
    if processor.waypoints_finalized and processor.lap_completed:
        lap_reward += 100.0

        # Extra bonus for a new best lap; lap times of 30.0 or less are ignored.
        lap_time = processor.current_lap_time
        if lap_time < processor.best_lap_time and lap_time > 30.0:
            processor.best_lap_time = lap_time
            lap_reward += 100.0

        # Restart the lap timer.
        processor.current_lap_time = 0.0

    # Track-position term, available only once waypoints are finalized.
    if processor.waypoints_finalized:
        progress_position_reward = 5.0 * processor.lap_progress
    else:
        progress_position_reward = 0.0

    # Bonuses minus penalties.
    reward = (
        speed_reward
        + waypoint_reward
        + waypoint_distance_reward
        + progress_position_reward
        + lap_reward
        - wall_penalty
        - collision_penalty
        - steering_penalty
        - spin_penalty
        - step_penalty
    )

    # Signed per-term breakdown, kept for debugging.
    reward_components = {
        'speed': speed_reward,
        'waypoint': waypoint_reward + waypoint_distance_reward,
        'position': progress_position_reward,
        'lap': lap_reward,
        'wall': -wall_penalty,
        'collision': -collision_penalty,
        'steering': -steering_penalty,
        'spin': -spin_penalty,
        'step': -step_penalty,
        'total': reward,
        'lap_time': processor.current_lap_time
    }

    return reward, reward_components
    

## Collect Rollout with Custom Reward

In [8]:
def collect_rollout(env, actor, critic, processor, max_steps=2000, render=False, deterministic=False, episode_num=0, total_episodes=5000):
    """Run one episode with the current policy and record everything PPO needs.

    Args:
        env: Environment whose ``reset(poses=...)`` and ``step(action)`` both
            return ``(obs, step_reward, done, info)``.
        actor: Policy network passed to `sample_action_and_logprob`.
        critic: Value network evaluated on each processed observation.
        processor: Observation processor; its per-episode lap fields are reset.
        max_steps: Upper bound on the number of environment steps.
        render: When True the environment is rendered after every step.
        deterministic: Forwarded to `sample_action_and_logprob`.
        episode_num: Index of this episode within training.
        total_episodes: Planned number of episodes; together with
            ``episode_num`` it defines the training progress fraction.

    Returns:
        ``(obs_list, act_list, logprob_list, value_list, reward_list,
        reward_components_list, steps)``.
    """
    observations, actions, log_probs, values = [], [], [], []
    rewards, reward_breakdowns = [], []

    # Every episode starts from the same pose; the heading is fixed so the car
    # faces the intended driving direction.
    start_pose = np.array([[0.0, 0.0, -0.6524]])
    obs, step_reward, done, info = env.reset(poses=start_pose)

    # Per-episode lap bookkeeping starts fresh.
    processor.lap_completed = False
    processor.current_lap_steps = 0
    processor.current_lap_time = 0.0

    # Fraction of training completed; drives the exploration bias.
    training_progress = episode_num / total_episodes

    prev_obs = obs
    done = False
    steps = 0

    while steps < max_steps and not done:
        obs_flat = processor.process_obs(obs)

        # Acting and value estimation need no gradients during collection.
        with torch.no_grad():
            value = critic(obs_flat)
            action, log_prob, _, _ = sample_action_and_logprob(actor, obs_flat, deterministic, training_progress)

        observations.append(obs_flat.detach())
        actions.append(action.detach())
        log_probs.append(log_prob.detach())
        values.append(value.detach())

        # The environment takes a (1, act_dim) NumPy array.
        action_np = action.cpu().numpy().reshape(1, -1)
        obs, step_reward, done, info = env.step(action_np)

        # The environment's step reward is accumulated as the lap time.
        processor.current_lap_time += step_reward

        reward, reward_components = compute_reward(obs, prev_obs, action_np, step_reward, done, processor)
        rewards.append(reward)
        reward_breakdowns.append(reward_components)

        prev_obs = obs
        steps += 1

        if render:
            env.render(mode='human_fast')

    return observations, actions, log_probs, values, rewards, reward_breakdowns, steps

## Compute Returns and Advantages

In [9]:
def compute_returns_and_advantages(rewards, values, gamma=0.99, lam=0.95):
    """Calculate GAE advantages and returns for one rollout.

    The value after the last step is taken to be 0, i.e. the rollout is
    treated as ending in a terminal state.
    NOTE(review): this also applies to rollouts that were cut off by a step
    limit rather than by the environment -- confirm that is acceptable.

    Args:
        rewards: Sequence of per-step scalar rewards.
        values: Sequence of per-step value tensors (one element each), in the
            same order as ``rewards``.
        gamma: Discount factor.
        lam: GAE smoothing parameter.

    Returns:
        ``(returns, advantages)`` as detached 1-D float tensors on the same
        device as ``values``. Advantages are standardized when the rollout has
        more than one step, and only mean-centered for a single step.
    """
    # Flatten every value to 1-D before concatenating so the result is always
    # a vector -- a bare .squeeze() would collapse a one-step rollout to a
    # 0-dim tensor that cannot be indexed.
    values = torch.cat([v.reshape(-1) for v in values])
    rewards = torch.tensor(rewards, dtype=torch.float32, device=values.device)

    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    next_value = 0.0
    next_advantage = 0.0

    # Backward recursion: delta_t = r_t + gamma * V_{t+1} - V_t and
    # A_t = delta_t + gamma * lam * A_{t+1}.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_value - values[t]
        advantages[t] = delta + gamma * lam * next_advantage
        returns[t] = advantages[t] + values[t]
        next_value = values[t]
        next_advantage = advantages[t]

    # Normalize advantages for more stable learning. The standard deviation
    # of a single element is NaN, so a one-step rollout is only centered.
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    else:
        advantages = advantages - advantages.mean()

    return returns.detach(), advantages.detach()

## PPO Update Step

In [10]:
def ppo_update(actor, critic, optimizer, obs_batch, act_batch, old_logprobs, returns, advantages,
              clip_eps=0.2, vf_coeff=0.5, ent_coeff=0.01, max_grad_norm=0.5,
              minibatch_size=64, min_updates=10):
    """Update actor and critic networks using the clipped PPO objective.

    Runs a number of gradient steps, each on a freshly drawn random minibatch
    of the rollout (minibatches may overlap between steps).

    Args:
        actor: Policy network returning ``(mu, std)``.
        critic: Value network.
        optimizer: Optimizer over both networks' parameters.
        obs_batch: List of per-step observation tensors.
        act_batch: List of per-step action tensors.
        old_logprobs: List of per-step log-probabilities recorded at collection time.
        returns: 1-D tensor of value targets.
        advantages: 1-D tensor of advantages.
        clip_eps: PPO ratio clipping range.
        vf_coeff: Weight of the value loss.
        ent_coeff: Weight of the entropy bonus.
        max_grad_norm: Gradient-norm clipping threshold over both networks.
        minibatch_size: Maximum number of samples per gradient step.
        min_updates: Minimum number of gradient steps per call.

    Returns:
        Averages over all gradient steps of
        ``(total loss, policy loss, value loss, entropy)`` as Python floats.
    """
    # Follow the actor's own device instead of relying on a notebook global.
    target_device = next(actor.parameters()).device

    obs_batch = torch.stack(obs_batch).to(target_device)
    act_batch = torch.stack(act_batch).to(target_device)
    old_logprobs = torch.stack(old_logprobs).to(target_device).reshape(-1)
    returns = returns.to(target_device).reshape(-1)
    advantages = advantages.to(target_device).reshape(-1)

    # Minibatch size is capped by the rollout length; short rollouts still get
    # at least `min_updates` gradient steps.
    batch_size = len(obs_batch)
    minibatch_size = min(minibatch_size, batch_size)
    num_updates = max(min_updates, batch_size // minibatch_size)

    # Loop-invariant: the parameter list used for gradient clipping.
    trainable_params = list(actor.parameters()) + list(critic.parameters())

    total_loss = 0
    total_policy_loss = 0
    total_value_loss = 0
    total_entropy = 0

    for _ in range(num_updates):
        # Randomly sample minibatch
        idx = torch.randperm(batch_size)[:minibatch_size]

        mb_obs = obs_batch[idx]
        mb_acts = act_batch[idx]
        mb_old_logprobs = old_logprobs[idx]
        mb_returns = returns[idx]
        mb_advantages = advantages[idx]

        # Get current policy distribution
        mu, std = actor(mb_obs)
        dist = Normal(mu, std)
        new_logprobs = dist.log_prob(mb_acts).sum(dim=-1)
        entropy = dist.entropy().sum(dim=-1)

        # Clipped surrogate objective
        ratio = torch.exp(new_logprobs - mb_old_logprobs)
        surr1 = ratio * mb_advantages
        surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * mb_advantages
        policy_loss = -torch.min(surr1, surr2).mean()

        # Value function loss. Only the trailing size-1 dimension is removed,
        # so a minibatch of one sample keeps its batch dimension and matches
        # the shape of mb_returns.
        value_pred = critic(mb_obs).squeeze(-1)
        value_loss = F.mse_loss(value_pred, mb_returns)

        # Total loss
        loss = policy_loss + vf_coeff * value_loss - ent_coeff * entropy.mean()

        # Perform update
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping for stability
        nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)

        optimizer.step()

        # Accumulate loss values
        total_loss += loss.item()
        total_policy_loss += policy_loss.item()
        total_value_loss += value_loss.item()
        total_entropy += entropy.mean().item()

    # Return average losses
    n = num_updates
    return total_loss/n, total_policy_loss/n, total_value_loss/n, total_entropy/n

## Training Loop

In [11]:
def _save_model_snapshot(path, actor, critic, processor, **extra):
    """Save network weights, normalization statistics, waypoints and `extra` to `path`."""
    snapshot = {
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'obs_rms_mean': processor.obs_rms.mean,
        'obs_rms_var': processor.obs_rms.var,
    }
    snapshot.update(extra)
    snapshot['waypoints'] = processor.waypoints
    snapshot['waypoints_finalized'] = processor.waypoints_finalized
    torch.save(snapshot, path)


def _plot_training_curves(reward_records, episode_length_records, lap_records):
    """Plot per-episode training metrics, save them to training_curves.png and show them."""
    plt.figure(figsize=(15, 10))

    plt.subplot(3, 2, 1)
    plt.plot(reward_records)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title("Training Reward Curve")
    plt.grid(True)

    plt.subplot(3, 2, 2)
    plt.plot(episode_length_records)
    plt.xlabel("Episode")
    plt.ylabel("Episode Length")
    plt.title("Episode Length")
    plt.grid(True)

    # Trailing mean over the current episode and up to 10 preceding ones.
    window_size = 10
    smoothed_rewards = [np.mean(reward_records[max(0, i-window_size):i+1]) for i in range(len(reward_records))]

    plt.subplot(3, 2, 3)
    plt.plot(smoothed_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Smoothed Reward (Window=10)")
    plt.title("Smoothed Reward Curve")
    plt.grid(True)

    plt.subplot(3, 2, 4)
    plt.plot(lap_records)
    plt.xlabel("Episode")
    plt.ylabel("Laps Completed")
    plt.title("Laps Completed")
    plt.grid(True)

    # Zoom in on the most recent episodes.
    last_n = min(50, len(reward_records))
    plt.subplot(3, 2, 5)
    plt.plot(range(len(reward_records)-last_n, len(reward_records)), reward_records[-last_n:])
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title(f"Last {last_n} Episodes Reward")
    plt.grid(True)

    # Lap completion rate, only if any laps were completed. The emptiness
    # check keeps max() from raising when no episode was run.
    if lap_records and max(lap_records) > 0:
        plt.subplot(3, 2, 6)
        # lap_records is cumulative, so the per-episode count is the difference.
        lap_differences = [lap_records[i] - lap_records[i-1] if i > 0 else lap_records[i] for i in range(len(lap_records))]
        window_size = min(10, len(lap_differences))
        smoothed_lap_rate = [np.mean(lap_differences[max(0, i-window_size):i+1]) for i in range(len(lap_differences))]
        plt.plot(smoothed_lap_rate)
        plt.xlabel("Episode")
        plt.ylabel("Lap Completion Rate")
        plt.title("Smoothed Lap Completion Rate")
        plt.grid(True)

    plt.tight_layout()
    plt.savefig("training_curves.png")
    plt.show()


def train_ppo(env_config='Austin_map.yaml', num_episodes=50000, save_interval=25, render_interval=20):
    """Train the PPO agent on the F1TENTH environment.

    Args:
        env_config: Path to a YAML file providing at least `map_path` and `map_ext`.
        num_episodes: Number of training episodes (one rollout + one update each).
        save_interval: Currently unused; kept for interface compatibility.
        render_interval: Render every this-many episodes; 0 or None disables rendering.

    Returns:
        ``(actor, critic, processor)`` after training.

    Raises:
        ValueError: If the YAML file parses to nothing.

    Side effects:
        Writes best_laptime_model.pt, best_reward_model.pt and
        training_curves.png to the working directory and shows the plots.
    """
    # Load environment configuration.
    # NOTE(review): yaml.load with FullLoader can construct more than plain
    # data; if the config file may come from an untrusted source, switch to
    # yaml.safe_load.
    with open(env_config) as file:
        conf_dict = yaml.load(file, Loader=yaml.FullLoader)

    if conf_dict is None:
        raise ValueError("⚠️ YAML file is empty or malformed!")

    conf = Namespace(**conf_dict)

    # Create environment
    env = gym.make('f110_gym:f110-v0', map=conf.map_path, map_ext=conf.map_ext, num_agents=1)

    # Initialize environment
    init_poses = np.array([[0.0, 0.0, 0.0]])
    obs, _, _, _ = env.reset(poses=init_poses)

    # Create observation processor
    processor = ObservationProcessor()

    # Process one observation to learn the input dimension.
    flat_obs = processor.process_obs(obs)
    obs_dim = flat_obs.shape[0]
    act_dim = 2  # Steering and acceleration

    # Create actor and critic networks
    actor = ActorNet(obs_dim, act_dim, hidden_dim=256).to(device)
    critic = CriticNet(obs_dim, hidden_dim=256).to(device)

    # One optimizer over both networks.
    initial_lr = 3e-4
    optimizer = torch.optim.Adam(
        list(actor.parameters()) + list(critic.parameters()),
        lr=initial_lr
    )

    # Linear learning-rate decay per episode, floored at 10% of the initial rate.
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda epoch: max(1.0 - epoch / num_episodes, 0.1)
    )

    # Training metrics
    reward_records = []
    episode_length_records = []
    lap_records = []
    best_reward = -np.inf
    best_lap_time = float('inf')

    print("✅ Starting PPO training...")
    for episode in tqdm(range(num_episodes), desc="Training PPO"):
        # Render only every `render_interval`-th episode (never when it is 0/None).
        render_flag = bool(render_interval) and (episode % render_interval == 0)

        # Collect rollout
        rollout = collect_rollout(env, actor, critic, processor, render=render_flag,
                                 episode_num=episode, total_episodes=num_episodes)
        obs_list, act_list, logprob_list, value_list, reward_list, reward_components_list, steps = rollout

        # Compute returns and advantages
        returns, advantages = compute_returns_and_advantages(reward_list, value_list)

        # Update policy
        loss, p_loss, v_loss, entropy = ppo_update(
            actor, critic, optimizer,
            obs_list, act_list, logprob_list,
            returns, advantages
        )

        # Update learning rate
        lr_scheduler.step()

        # Record metrics
        total_reward = sum(reward_list)
        reward_records.append(total_reward)
        episode_length_records.append(steps)
        lap_records.append(processor.lap_count)

        # Save best lap time model
        if processor.best_lap_time < best_lap_time and processor.best_lap_time < float('inf'):
            best_lap_time = processor.best_lap_time
            _save_model_snapshot("best_laptime_model.pt", actor, critic, processor,
                                 best_lap_time=best_lap_time)
            print(f"\n🏆 New best lap time: {best_lap_time:.2f} seconds! Model saved.")

        # Save best reward model
        if total_reward > best_reward:
            best_reward = total_reward
            _save_model_snapshot("best_reward_model.pt", actor, critic, processor,
                                 best_reward=best_reward)

        # Periodic status line.
        if episode % 50 == 0 and processor.best_lap_time < float('inf'):
            print(f"Best lap time: {processor.best_lap_time:.2f} seconds")

    # Plot training curves and statistics
    _plot_training_curves(reward_records, episode_length_records, lap_records)

    # Print additional summary statistics
    print("\n===== Training Summary =====")
    print(f"Total Episodes: {num_episodes}")
    print(f"Best Reward: {best_reward:.2f}")
    if best_lap_time < float('inf'):
        print(f"Best Lap Time: {best_lap_time:.2f} seconds")
    print(f"Total Laps Completed: {processor.lap_count}")
    print(f"Waypoints Collected: {len(processor.waypoints)}")
    print(f"Waypoints Finalized: {processor.waypoints_finalized}")

    return actor, critic, processor


## Render and Evaluate the Trained Policy

In [12]:
def render_policy(env, actor, processor, max_steps=1000, render_delay=0.1):
    """Roll out the policy deterministically, render every step, and report lap times.

    Args:
        env: Environment exposing ``reset(poses=...)``, ``step(action)`` and
            ``render()``. Both ``reset`` and ``step`` are unpacked as
            ``(obs, step_reward, done, info)``.
        actor: Policy network passed to ``sample_action_and_logprob``.
        processor: Observation processor. ``is_training`` is cleared for the
            duration of the run and set back to ``True`` on exit (also when
            the run raises); ``lap_completed`` and ``current_lap_time`` are
            reset before the first step.
        max_steps: Upper bound on the number of environment steps.
        render_delay: Seconds to pause after each rendered frame so the
            visualization is watchable; ``0`` disables the pause.

    Returns:
        list: Lap times recorded during the run, in completion order
        (empty when no lap was completed).
    """
    # Switch processor to evaluation mode
    processor.is_training = False

    try:
        # Reset environment
        obs, step_reward, done, info = env.reset(poses=np.array([[0.0, 0.0, 0.0]]))

        # Reset lap tracking
        processor.lap_completed = False
        processor.current_lap_time = 0.0

        total_reward = 0
        step = 0
        lap_times = []

        print("\n🏁 Starting evaluation run...")

        while not done and step < max_steps:
            # Process observation without touching the running normalization stats
            obs_flat = processor.process_obs(obs, update_stats=False)

            # Get action from policy (deterministic)
            with torch.no_grad():
                action, _, _, _ = sample_action_and_logprob(actor, obs_flat, deterministic=True)

            # Take step in environment
            action_np = action.cpu().numpy().reshape(1, -1)
            next_obs, step_reward, done, info = env.step(action_np)

            # Update lap time
            # NOTE(review): presumably the env reports elapsed sim time as the
            # step reward, which is why it is accumulated as lap time — confirm.
            processor.current_lap_time += step_reward

            # Calculate reward (for display only)
            reward, components = compute_reward(next_obs, obs, action_np, step_reward, done, processor)
            total_reward += reward

            # Check if lap was completed this step
            # (the flag is not set in this function; presumably compute_reward
            # or the processor raises it — verify)
            if processor.lap_completed:
                lap_times.append(processor.current_lap_time)
                print(f"🏎️ Lap {len(lap_times)} completed in {processor.current_lap_time:.2f} seconds")
                processor.current_lap_time = 0.0

            # Update for next iteration
            obs = next_obs
            step += 1

            # Print occasional status
            # if step % 100 == 0:
            #     speed = np.hypot(obs['linear_vels_x'][0], obs['linear_vels_y'][0])
            #     print(f"Step {step}: Speed = {speed:.2f} m/s, Progress = {processor.lap_progress:.2f}")

            # Render environment
            env.render()
            if render_delay > 0:
                time.sleep(render_delay)  # Slow down rendering for better visualization

        # Summary
        print(f"\n✅ Evaluation completed after {step} steps with total reward {total_reward:.2f}")
        print(f"Completed {processor.lap_count} laps")

        if lap_times:
            print("\nLap times:")
            # The loop variable must not be called `time`: binding that name
            # anywhere in this function would make it local and break the
            # `time.sleep` call in the rollout loop with UnboundLocalError.
            for lap_index, lap_time in enumerate(lap_times):
                print(f"  Lap {lap_index+1}: {lap_time:.2f} seconds")
            print(f"  Best lap: {min(lap_times):.2f} seconds")
    finally:
        # Switch processor back to training mode, even if the rollout failed
        processor.is_training = True

    return lap_times


In [13]:
def load_model(checkpoint_path, obs_dim, act_dim):
    """Rebuild actor, critic and observation processor from a checkpoint file.

    Args:
        checkpoint_path: Path to a file written with ``torch.save`` holding at
            least the keys ``'actor'``, ``'critic'``, ``'obs_rms_mean'`` and
            ``'obs_rms_var'``. ``'waypoints'``, ``'waypoints_finalized'``,
            ``'best_lap_time'`` and ``'best_reward'`` are used when present.
        obs_dim: Observation dimension used to construct both networks.
        act_dim: Action dimension used to construct the actor.

    Returns:
        tuple: ``(actor, critic, processor)`` with weights, normalization
        statistics and (if stored) waypoints restored.

    Raises:
        KeyError: If a required checkpoint key is missing.
        RuntimeError: If a state dict does not match the constructed network.
    """
    # map_location makes checkpoints saved on a GPU loadable on a CPU-only
    # machine (and vice versa) by remapping tensors onto the active device.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Recover the actor width from the saved weights instead of relying on
    # ActorNet's default hidden_dim, which need not match the width used when
    # the checkpoint was written. fc1 is Linear(obs_dim, hidden_dim), so its
    # weight has hidden_dim rows.
    actor_hidden_dim = checkpoint['actor']['fc1.weight'].shape[0]
    actor = ActorNet(obs_dim, act_dim, hidden_dim=actor_hidden_dim).to(device)
    # NOTE(review): CriticNet is still built with its default width; if it was
    # trained with a non-default hidden size, pass that size here too — confirm
    # against CriticNet's signature.
    critic = CriticNet(obs_dim).to(device)

    actor.load_state_dict(checkpoint['actor'])
    critic.load_state_dict(checkpoint['critic'])

    # Create processor with saved normalization parameters
    processor = ObservationProcessor()
    processor.obs_rms.mean = checkpoint['obs_rms_mean']
    processor.obs_rms.var = checkpoint['obs_rms_var']

    # Load waypoints if available
    if 'waypoints' in checkpoint:
        processor.waypoints = checkpoint['waypoints']
    if 'waypoints_finalized' in checkpoint:
        processor.waypoints_finalized = checkpoint['waypoints_finalized']

    print(f"Loaded model from {checkpoint_path}")
    print(f"Waypoints: {len(processor.waypoints)} loaded, Finalized: {processor.waypoints_finalized}")

    if 'best_lap_time' in checkpoint:
        print(f"Best lap time in checkpoint: {checkpoint['best_lap_time']:.2f} seconds")
    if 'best_reward' in checkpoint:
        print(f"Best reward in checkpoint: {checkpoint['best_reward']:.2f}")

    return actor, critic, processor


In [None]:
if __name__ == "__main__":
    # Driver cell: train a new policy or load a saved one, then render one
    # evaluation run on the Austin map.
    # Command line arguments could be added here

    # Re-seed every RNG that the config cell seeds, using the same SEED, so a
    # re-run of this cell starts from the same random state for all of them.
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

    # Train or load model
    TRAIN_NEW_MODEL = True  # Set to False to load a saved model

    # Read the map configuration once, before any long-running work, so a
    # missing or malformed file fails immediately rather than after training.
    # NOTE(review): FullLoader can build more than plain data types; if the
    # map file only holds scalars/strings, yaml.safe_load would be stricter.
    with open('Austin_map.yaml') as file:
        conf_dict = yaml.load(file, Loader=yaml.FullLoader)
    conf = Namespace(**conf_dict)

    if TRAIN_NEW_MODEL:
        print("Training new model...")
        actor, critic, processor = train_ppo(num_episodes=100000)

    # Single environment instance: created after training returns (same
    # ordering as before) and shared by the load path and the rendering run,
    # instead of building one env to probe dimensions and a second to render.
    env = gym.make('f110_gym:f110-v0', map=conf.map_path, map_ext=conf.map_ext, num_agents=1)

    if not TRAIN_NEW_MODEL:
        # Probe one observation to get the flattened observation dimension
        obs, _, _, _ = env.reset(poses=np.array([[0.0, 0.0, 0.0]]))

        # Throwaway processor: only used to measure the processed obs size
        temp_processor = ObservationProcessor()
        flat_obs = temp_processor.process_obs(obs)
        obs_dim = flat_obs.shape[0]
        act_dim = 2

        # Load model
        model_path = "best_laptime_model.pt"  # Change to desired model
        actor, critic, processor = load_model(model_path, obs_dim, act_dim)

    # Render and evaluate trained policy (render_policy resets the env itself)
    lap_times = render_policy(env, actor, processor)

    print("✅ Evaluation complete.")

Training new model...




✅ Starting PPO training...


Training PPO:   0%|          | 0/100000 [00:00<?, ?it/s]2025-04-09 02:01:59.736 Python[61464:11808343] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/8j/tyq2b18j2w320hyzqh054kr80000gn/T/org.python.python.savedState
2025-04-09 02:02:00.476 Python[61464:11808343] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-09 02:02:00.592 Python[61464:11808343] +[IMKInputSession subclass]: chose IMKInputSession_Modern
Training PPO:   0%|          | 385/100000 [00:34<3:15:01,  8.51it/s]