In [10]:
# %pip install pip==21.0 setuptools==65.5.0
# %pip install torch torchvision torchaudio gym==0.19.0 numpy==1.23.5 matplotlib==3.5.3 tqdm
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
"""
In Terminal inside venv
git clone https://github.com/f1tenth/f1tenth_gym.git
cd f1tenth_gym
pip install -e .
"""


'\nIn Terminal inside venv\ngit clone https://github.com/f1tenth/f1tenth_gym.git\ncd f1tenth_gym\npip install -e .\n'

## Imports and Setup

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import yaml
from argparse import Namespace
from collections import deque
import random

# Set device (prefer GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [None]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Define hyperparameters
HIDDEN_DIM = 512  # Increased network capacity
LEARNING_RATE = 3e-4  # Higher learning rate with scheduling
GAMMA = 0.99  # Higher discount factor for long-term planning
LAMBDA = 0.95  # GAE parameter
CLIP_EPSILON = 0.2  # PPO clipping parameter
VF_COEFF = 0.5  # Value function loss coefficient
ENT_COEFF = 0.01  # Entropy coefficient
PPO_EPOCHS = 10  # More PPO epochs per rollout
GRAD_CLIP = 0.5  # Gradient clipping threshold
BATCH_SIZE = 64  # Mini-batch size
LR_DECAY = 0.9999  # Learning rate decay factor

## Actor and Critic Networks

In [12]:
class ActorNet(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.mu_head = nn.Linear(hidden_dim, act_dim)

        # Learnable log_std initialized to safe value
        self.log_std_param = nn.Parameter(torch.full((act_dim,), -0.05))

    def forward(self, x):
        # x = torch.clamp(x, -1e6, 1e6)  # Clamp obs input
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = self.mu_head(x)

        # Clamp log_std to avoid NaN in exp
        log_std = self.log_std_param
        std = torch.exp(log_std)

        # Final safety check
        if torch.isnan(mu).any() or torch.isnan(std).any():
            raise ValueError(f"NaNs in actor output: mu={mu}, std={std}")

        return mu, std


class CriticNet(nn.Module):
    def __init__(self, obs_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

## Flatten Observation

In [13]:
def flatten_observation(obs):
    scan = obs['scans'][0][::4]  # from 1080 → 270          
    x = np.array([obs['poses_x'][0]])
    y = np.array([obs['poses_y'][0]])
    theta = np.array([obs['poses_theta'][0]])
    v_x = np.array([obs['linear_vels_x'][0]])
    v_y = np.array([obs['linear_vels_y'][0]])

    flat_obs = np.concatenate([scan, x, y, theta, v_x, v_y])
    # flat_obs = np.nan_to_num(flat_obs, nan=0.0, posinf=0.0, neginf=0.0)
    # flat_obs = np.clip(flat_obs, -1e6, 1e6)  # <-- Add this line
    return torch.tensor(flat_obs, dtype=torch.float32).to(device)


## Sample Action from Policy

In [14]:
from torch.distributions import Normal

def sample_action_and_logprob(actor, obs_flat, episode_num=None):
    mu, std = actor(obs_flat)
    dist = Normal(mu, std)
    action = dist.sample()

    # Add extra exploration noise in early episodes
    if episode_num is not None and episode_num < 1000:
        action += torch.randn_like(action) * 0.1  # or 0.1

    log_prob = dist.log_prob(action).sum()
    entropy = dist.entropy().sum()

    # action = torch.clamp(action,
    #                      min=torch.tensor([-0.5, 0.0], device=action.device),
    #                      max=torch.tensor([ 0.5, 10.0], device=action.device))
    return action, log_prob, entropy, dist

## Collect Rollout with Custom Reward

In [15]:
def collect_rollout(env, actor, critic, max_steps=10000, render=False):
    obs_list, act_list, logprob_list, value_list, reward_list = [], [], [], [], []
    obs, _, _, _ = env.reset(poses=np.array([[0.0, 0.0, -0.6524]]))
    done = False
    steps = 0
    prev_x = obs['poses_x'][0]
    prev_y = obs['poses_y'][0]

    while not done and steps < max_steps:
        obs_flat = flatten_observation(obs)
        value = critic(obs_flat)
        action, log_prob, _, _ = sample_action_and_logprob(actor, obs_flat, steps)

        obs_list.append(obs_flat.detach())
        act_list.append(action.detach())
        logprob_list.append(log_prob.detach())
        value_list.append(value.detach())

        action_np = action.cpu().numpy().reshape(1, -1)
        obs, _, done, _ = env.step(action_np)

        # Extract key values
        v_x = obs['linear_vels_x'][0]
        v_y = obs['linear_vels_y'][0]
        curr_x = obs['poses_x'][0]
        curr_y = obs['poses_y'][0]
        heading = obs['poses_theta'][0]
        ang_vel_z = obs['ang_vels_z'][0]
        steering = action_np[0, 0]
        min_scan = np.min(obs['scans'][0])
        collision = obs['collisions'][0]

        # Motion analysis
        delta_x = curr_x - prev_x
        delta_y = curr_y - prev_y
        velocity_mag = np.hypot(v_x, v_y)
        velocity_angle = np.arctan2(v_y, v_x)
        heading_diff = (velocity_angle - heading + np.pi) % (2 * np.pi) - np.pi
        progress = delta_x * np.cos(heading) + delta_y * np.sin(heading)

        # # Reward components
        # r_progress = 6.0 * max(0.0, progress)
        r_distance = 200.0 * (1 - np.exp(-50.0 * progress))
        r_velocity = 200.0 * (1 / (1 + np.exp(-5 * (velocity_mag - 0.1))) - 0.5)
        r_spin = -5 * abs(ang_vel_z)
        r_steering = -0.5 * abs(steering) * velocity_mag
        r_heading = -0.3 * abs(heading_diff)
        r_collision = -100.0 if collision == 1 else 0.0
        r_wall = -5.0 if min_scan < 0.05 else 0.0
        r_step = -0.05

        reward = (
            r_distance +
            r_velocity +
            r_spin +
            r_heading +
            r_steering +
            r_collision +
            r_wall +
            r_step
        )

        reward_list.append(reward)
        steps += 1
        prev_x, prev_y = curr_x, curr_y

        # # 👀 Debug output for first 10 steps
        # if steps >= 500 and steps <= 510:
        #     print(f"\n🧾 Step {steps} reward breakdown:")
        #     print(f"  Progress:      {r_distance:+.3f}")
        #     print(f"  Velocity:      {r_velocity:+.3f}")
        #     print(f"  Spinning:      {r_spin:+.3f}")
        #     print(f"  Steering:      {r_steering:+.3f}")
        #     # print(f"  Heading diff:  {r_heading:+.3f}")
        #     print(f"  Wall warning:  {r_wall:+.3f}")
        #     print(f"  Collision:     {r_collision:+.3f}")
        #     print(f"  Step penalty:  {r_step:+.3f}")
        #     print(f"  👉 Total:       {reward:+.3f}")

        if render:
            env.render(mode='human_fast')

    return obs_list, act_list, logprob_list, value_list, reward_list

## Compute Returns and Advantages

In [16]:
def compute_returns_and_advantages(rewards, values, gamma=0.98, lam=0.95):
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    values = torch.stack(values).squeeze().to(device)
    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    next_value = 0
    next_advantage = 0

    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_value - values[t]
        advantages[t] = delta + gamma * lam * next_advantage
        returns[t] = advantages[t] + values[t]
        next_value = values[t]
        next_advantage = advantages[t]

    return returns.detach(), advantages.detach()


## PPO Update Step

In [17]:
def ppo_update(actor, critic, optimizer, obs_batch, act_batch, old_logprobs, returns, advantages,
               clip_eps=0.2, vf_coeff=0.5, ent_coeff=0.1):
    obs_batch = torch.stack(obs_batch).to(device)
    act_batch = torch.stack(act_batch).to(device)
    old_logprobs = torch.stack(old_logprobs).to(device)
    returns = returns.to(device)
    advantages = advantages.to(device)

    mu, std = actor(obs_batch)
    dist = Normal(mu, std)
    new_logprobs = dist.log_prob(act_batch).sum(dim=-1)
    entropy = dist.entropy().sum(dim=-1)

    ratio = torch.exp(new_logprobs - old_logprobs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()
    value_loss = F.mse_loss(critic(obs_batch).squeeze(), returns)
    loss = policy_loss + vf_coeff * value_loss - ent_coeff * entropy.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item(), policy_loss.item(), value_loss.item(), entropy.mean().item()


## Training Loop

In [18]:
with open('Austin_map.yaml') as file:
        conf_dict = yaml.load(file, Loader=yaml.FullLoader)

if conf_dict is None:
    raise ValueError("⚠️ YAML file is empty or malformed!")

conf = Namespace(**conf_dict)

env = gym.make('f110_gym:f110-v0', map=conf.map_path, map_ext=conf.map_ext, num_agents=1)

init_poses = np.array([[0.0, 0.0, 0.0]])
print("Step 3: Resetting env...")
obs, _, _, _ = env.reset(poses=init_poses)
print("Step 4: Env reset done.")
flat_obs = flatten_observation(obs)
obs_dim = flat_obs.shape[0]
act_dim = 2

actor_func = ActorNet(obs_dim, act_dim).to(device)
value_func = CriticNet(obs_dim).to(device)
optimizer = torch.optim.Adam(list(actor_func.parameters()) + list(value_func.parameters()), lr=2e-5)

reward_records = []
best_reward = -np.inf
print("✅ Starting PPO training...")
for episode in tqdm(range(100000), desc="Training PPO"):
    render_flag = (episode % 25 == 0)
    rollout = collect_rollout(env, actor_func, value_func, render=render_flag)
    obs_list, act_list, logprob_list, value_list, reward_list = rollout
    returns, advantages = compute_returns_and_advantages(reward_list, value_list)

    for _ in range(8):  # PPO epochs per rollout
        loss, p_loss, v_loss, entropy = ppo_update(
            actor_func, value_func, optimizer,
            obs_list, act_list, logprob_list,
            returns, advantages
        )

    total_reward = sum(reward_list)
    reward_records.append(total_reward)

    if total_reward > best_reward:
        best_reward = total_reward
        torch.save(actor_func.state_dict(), "best_ppo_actor.pt")

    # print(f"Episode {episode+1}: Total Reward = {total_reward:.2f}, Best = {best_reward:.2f}")

    # if episode >= 100 and np.mean(reward_records[-100:]) > 1000:
    #     print("✅ Early stopping: stable policy learned")
    #     break



Step 3: Resetting env...
Step 4: Env reset done.
✅ Starting PPO training...


Training PPO:   1%|▏         | 1495/100000 [29:19<32:11:45,  1.18s/it] 


ValueError: NaNs in actor output: mu=tensor([[nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan],
        [nan, nan]], grad_fn=<AddmmBackward0>), std=tensor([nan, nan], grad_fn=<ExpBackward0>)

## Plot Rewards

In [None]:
plt.plot(reward_records, label="Reward")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("PPO Training Reward Curve")
plt.grid(True)
plt.legend()
plt.show()

## Render Trained Policy

In [None]:
def render_policy(env, actor, max_steps=500):
    obs, _, _, _ = env.reset(poses=np.array([[0.0, 0.0, 0.0]]))
    done = False
    for _ in range(max_steps):
        obs_flat = flatten_observation(obs)
        with torch.no_grad():
            action, _, _, _ = sample_action_and_logprob(actor, obs_flat)
        action_np = action.cpu().numpy().reshape(1, -1)
        obs, _, done, _ = env.step(action_np)
        env.render()
        if done:
            break

render_policy(env, actor_func)
print("✅ Render complete.")