In [1]:
# !pip install pip==21.0 setuptools==65.5.0
# !pip install torch torchvision torchaudio gym==0.19.0 numpy==1.23.5 matplotlib==3.5.3 tqdm 
"""
In Terminal inside venv
git clone https://github.com/f1tenth/f1tenth_gym.git
cd f1tenth_gym
pip install -e .
"""


'\nIn Terminal inside venv\ngit clone https://github.com/f1tenth/f1tenth_gym.git\ncd f1tenth_gym\npip install -e .\n'

In [2]:


import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from tqdm import tqdm

import f110_gym

# Choose the compute device once: the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU. Later cells move tensors/models onto it.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

Using device: cpu


In [3]:
import shutil
import numba
# Delete the directory named by numba's CACHE_DIR setting, ignoring any error
# (e.g. the directory not existing) so the cell never fails.
# NOTE(review): if numba.config.CACHE_DIR is empty/unset this is presumably a
# no-op -- confirm the setting is populated in the target environment.
shutil.rmtree(numba.config.CACHE_DIR, ignore_errors=True)

## Actor & Critic

In [4]:
# %%
# PPO with continuous action space uses Gaussian (Normal) policy

class ActorNet(nn.Module):
    """
    Gaussian policy network for continuous actions.

    Two ReLU hidden layers feed a linear head that produces the action mean.
    The standard deviation does not depend on the observation: it is the
    exponential of a learnable per-action-dimension ``log_std`` parameter,
    initialised to zeros (i.e. std = 1).
    """

    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        # Trunk: obs_dim -> hidden_dim -> hidden_dim
        self.fc1 = nn.Linear(in_features=obs_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        # Head producing one mean per action dimension
        self.mu_head = nn.Linear(in_features=hidden_dim, out_features=act_dim)
        # State-independent log standard deviation, learned with the network
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, x):
        """Return ``(mu, std)`` of the action distribution for observation ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.mu_head(hidden), torch.exp(self.log_std)


class CriticNet(nn.Module):
    """
    State-value network: two ReLU hidden layers followed by a linear head
    that emits a single scalar value estimate per observation.
    """

    def __init__(self, obs_dim, hidden_dim=256):
        super().__init__()
        # Trunk: obs_dim -> hidden_dim -> hidden_dim
        self.fc1 = nn.Linear(in_features=obs_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        # Scalar value head
        self.v_head = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.v_head(hidden)


## Observation Preprocessing

In [5]:
# %%
def flatten_observation(obs_dict):
    """
    Turn an environment observation dict into one flat float32 tensor.

    Only the first entry (index 0) of each field is used. The result is the
    scan followed by, in order: poses_x, poses_y, poses_theta, linear_vels_x,
    linear_vels_y. NaN and +/-inf entries are replaced by 0.0, and the tensor
    is placed on the notebook-level ``device``.
    """
    scalar_keys = ('poses_x', 'poses_y', 'poses_theta', 'linear_vels_x', 'linear_vels_y')

    # Scan first, then each scalar wrapped as a length-1 array
    pieces = [obs_dict['scans'][0]]
    pieces.extend(np.array([obs_dict[key][0]]) for key in scalar_keys)

    # Scrub non-finite readings before handing the vector to the networks
    features = np.nan_to_num(np.concatenate(pieces), nan=0.0, posinf=0.0, neginf=0.0)
    return torch.tensor(features, dtype=torch.float32).to(device)

## Env & Network

In [6]:
# Map location passed to the environment. Set the F110_MAP_PATH environment
# variable to point at your own map; the fallback is the original author's
# machine-specific path (given without a file extension) and will not exist
# elsewhere.
MAP_PATH = os.environ.get(
    "F110_MAP_PATH",
    '/Users/raju0103/Desktop/College Stuff/Northeastern/RL 2/RL_Project/map/vegas',
)

env = gym.make("f110_gym:f110-v0",
               num_agents=1,
               map=MAP_PATH,
               )

# Single agent starting pose as one row of three values
# (presumably [x, y, theta] -- confirm against the f110_gym docs).
init_poses = np.array([[0.0, 0.0, 0.0]])
obs, _, _, _ = env.reset(poses=init_poses)

# Trigger proper action space setup
dummy_action = np.array([[0.0, 0.0]])
obs, _, done, _ = env.step(dummy_action)

# Flatten obs and get dimensions
flat_obs = flatten_observation(obs)
# Scan length + 5 state scalars (x, y, theta, v_x, v_y): 1085 for a 1080-beam scan
obs_dim = flat_obs.shape[0]
act_dim = 2  # [steering, velocity]

# Create models
actor_func = ActorNet(obs_dim, act_dim).to(device)
value_func = CriticNet(obs_dim).to(device)

print("Observation dim:", obs_dim)
print("Action dim:", act_dim)



TypeError: 'NoneType' object is not subscriptable

## Action Sampling

In [7]:
# %%
def sample_action_and_logprob(actor, obs_flat):
    """
    Sample an action from the actor's Gaussian policy.

    Per-dimension log-probabilities and entropies are summed over the last
    (action) axis only, matching ``ppo_update``. For a single 1-D observation
    this yields scalars exactly as before; for a batched observation it yields
    one value per batch row instead of collapsing the whole batch.

    Args:
        actor: callable returning ``(mu, std)`` for ``obs_flat``.
        obs_flat: observation tensor (a single flat observation, or a batch
            if the actor supports batched input).

    Returns:
        ``(action, log_prob, entropy, dist)`` where ``dist`` is the
        ``Normal(mu, std)`` distribution the action was drawn from.

    Raises:
        ValueError: if the observation or the actor output contains NaNs.
    """
    if torch.isnan(obs_flat).any():
        raise ValueError("obs_flat contains NaNs")

    mu, std = actor(obs_flat)
    if torch.isnan(mu).any() or torch.isnan(std).any():
        raise ValueError(f"Actor output contains NaNs: mu={mu}, std={std}")

    dist = Normal(mu, std)
    action = dist.sample()
    # Independent action dimensions: joint log-prob / entropy is the sum over
    # the action axis (dim=-1), not over every element of the tensor.
    log_prob = dist.log_prob(action).sum(dim=-1)
    entropy = dist.entropy().sum(dim=-1)
    return action, log_prob, entropy, dist

## Rollout Collection

In [8]:
# %%
def collect_rollout(env, actor, critic, max_steps=500, render=False):
    """
    Run one episode with the current policy and record the transitions.

    The environment is reset to pose ``[0, 0, 0]`` and stepped until it
    reports ``done`` or ``max_steps`` steps have been taken.

    Args:
        env: environment whose ``reset(poses=...)`` and ``step(action)`` return
            4-tuples with the observation first and ``done`` third.
        actor: policy network passed to ``sample_action_and_logprob``.
        critic: value network called on the flattened observation.
        max_steps: hard cap on the number of steps collected.
        render: when True, ``env.render()`` is called after every step.

    Returns:
        ``(obs_list, act_list, logprob_list, value_list, reward_list)`` --
        five equally long lists with one entry per step. Tensors are detached.
    """
    obs_list, act_list, logprob_list, value_list, reward_list = [], [], [], [], []

    init_poses = np.array([[0.0, 0.0, 0.0]])
    obs, _, _, _ = env.reset(poses=init_poses)
    done = False
    steps = 0

    while not done and steps < max_steps:
        obs_flat = flatten_observation(obs)

        # Everything stored below is detached, so run the forward passes
        # without autograd to avoid building a graph on every step.
        with torch.no_grad():
            value = critic(obs_flat)
            action, log_prob, _, _ = sample_action_and_logprob(actor, obs_flat)

        # Save step info
        obs_list.append(obs_flat.detach())
        act_list.append(action.detach())
        logprob_list.append(log_prob.detach())
        value_list.append(value.detach())

        # Step: the env receives a (1, act_dim) array for the single agent
        action_np = action.detach().cpu().numpy().reshape(1, -1)
        obs, _, done, _ = env.step(action_np)

        # Reward: encourage forward aligned motion
        v_x = obs['linear_vels_x'][0]
        v_y = obs['linear_vels_y'][0]
        heading = obs['poses_theta'][0]

        velocity_mag = np.sqrt(v_x**2 + v_y**2)
        velocity_angle = np.arctan2(v_y, v_x)
        # NOTE(review): this assumes v_x/v_y and poses_theta share a frame; if
        # the env reports velocities in the vehicle body frame the difference
        # below mixes frames -- confirm against the f110_gym observation docs.
        heading_diff = velocity_angle - heading

        # Reward is projection of velocity in heading direction
        reward = velocity_mag * np.cos(heading_diff)

        # Penalty whenever the environment itself ends the episode. Hitting
        # the max_steps cap does not set `done`, so it is not penalised.
        if done:
            reward -= 5.0

        reward_list.append(reward)
        steps += 1

        if render:
            env.render()

    return obs_list, act_list, logprob_list, value_list, reward_list

## Return & Advantage Calculation

In [9]:
def compute_returns_and_advantages(rewards, values, gamma=0.99, lam=0.95):
    """
    Compute Generalized Advantage Estimates (GAE) and the matching value targets.

    The rollout is walked backwards using
        delta_t = r_t + gamma * V_{t+1} - V_t
        A_t     = delta_t + gamma * lam * A_{t+1}
        R_t     = A_t + V_t
    with both V and A taken as 0 after the final step.

    Args:
        rewards: sequence of per-step scalar rewards.
        values: list of per-step value tensors, each holding a single element.
        gamma: discount factor.
        lam: GAE lambda.

    Returns:
        ``(returns, advantages)`` -- two detached 1-D float32 tensors on
        ``device``, one entry per step (both empty for an empty rollout).

    Raises:
        ValueError: if ``values`` does not provide exactly one scalar per reward.
    """
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    num_steps = rewards.numel()
    if num_steps == 0:
        # Nothing collected: torch.stack([]) would raise, so return empties.
        empty = torch.zeros(0, dtype=torch.float32, device=device)
        return empty, empty.clone()

    # reshape(-1) keeps the time axis even for a one-step rollout; a bare
    # squeeze() would collapse it to a 0-d tensor and break values[t] below.
    values = torch.stack(values).detach().reshape(-1).to(device)
    if values.numel() != num_steps:
        raise ValueError(
            f"Expected one value per reward, got {values.numel()} values "
            f"for {num_steps} rewards"
        )

    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    next_value = 0.0
    next_advantage = 0.0

    for t in reversed(range(num_steps)):
        delta = rewards[t] + gamma * next_value - values[t]
        advantages[t] = delta + gamma * lam * next_advantage
        returns[t] = advantages[t] + values[t]

        next_value = values[t]
        next_advantage = advantages[t]

    return returns.detach(), advantages.detach()

## PPO Loss and Backward Pass

In [10]:

def ppo_update(actor, critic, optimizer, obs_batch, act_batch, old_logprobs, returns, advantages,
               clip_eps=0.2, vf_coeff=0.5, ent_coeff=0.01):
    """
    Perform one PPO-Clip gradient step on the collected rollout.

    Args:
        actor: policy network returning ``(mu, std)`` for a batch of observations.
        critic: value network returning one value per observation.
        optimizer: optimizer stepped once on the combined loss.
        obs_batch: list of per-step observation tensors (stacked internally).
        act_batch: list of per-step action tensors (stacked internally).
        old_logprobs: list of per-step log-probabilities recorded at collection time.
        returns: tensor of value targets, one per step.
        advantages: tensor of advantages, one per step.
        clip_eps: clipping range for the probability ratio.
        vf_coeff: weight of the value (MSE) loss.
        ent_coeff: weight of the entropy bonus.

    Returns:
        ``(loss, policy_loss, value_loss, mean_entropy)`` as Python floats.
    """
    obs_batch = torch.stack(obs_batch).to(device)
    act_batch = torch.stack(act_batch).to(device)
    old_logprobs = torch.stack(old_logprobs).to(device)
    returns = returns.to(device)
    advantages = advantages.to(device)

    # Recompute action distribution with current policy
    mu, std = actor(obs_batch)
    dist = Normal(mu, std)
    new_logprobs = dist.log_prob(act_batch).sum(dim=-1)
    entropy = dist.entropy().sum(dim=-1)
    mean_entropy = entropy.mean()

    # Policy ratio pi_new(a|s) / pi_old(a|s), computed in log space
    ratio = torch.exp(new_logprobs - old_logprobs)

    # PPO clipped surrogate objective (negated: the optimizer minimises)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()

    # Value loss. Drop only the trailing size-1 axis: a bare squeeze() would
    # also drop the batch axis for a one-sample batch and make mse_loss
    # broadcast against `returns`.
    values = critic(obs_batch).squeeze(-1)
    value_loss = F.mse_loss(values, returns)

    # Total loss
    loss = policy_loss + vf_coeff * value_loss - ent_coeff * mean_entropy

    # Optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item(), policy_loss.item(), value_loss.item(), mean_entropy.item()

## Training

In [None]:


# --- PPO hyperparameters ---
total_episodes = 500               # upper bound on training episodes
gamma, lam = 0.99, 0.95            # discount factor and GAE lambda
clip_eps = 0.2                     # clipping range for the probability ratio
vf_coeff, ent_coeff = 0.5, 0.01    # value-loss and entropy-bonus weights
lr = 1e-4                          # Adam learning rate

# A single Adam optimizer updates the actor and the critic together
optimizer = torch.optim.Adam(
    [*actor_func.parameters(), *value_func.parameters()],
    lr=lr,
)

# Per-episode history: total reward and (loss, policy loss, value loss, entropy)
reward_records = []
loss_records = []

# Training loop with tqdm
progress_bar = tqdm(range(total_episodes), desc="Training PPO")
for episode in progress_bar:
    # Render every 25th episode (including the first)
    render_flag = not (episode % 25)

    # One rollout with the current policy
    obs_list, act_list, logprob_list, value_list, reward_list = collect_rollout(
        env, actor_func, value_func, render=render_flag
    )

    # GAE advantages and value targets for that rollout
    returns, advantages = compute_returns_and_advantages(reward_list, value_list, gamma, lam)

    # Single PPO step; a failed update is reported and logged as NaNs so that
    # training carries on with the next episode.
    try:
        loss, p_loss, v_loss, entropy = ppo_update(
            actor_func, value_func, optimizer,
            obs_list, act_list, logprob_list,
            returns, advantages,
            clip_eps=clip_eps, vf_coeff=vf_coeff, ent_coeff=ent_coeff
        )
    except Exception as e:
        print(f"Update failed: {e}")
        loss = p_loss = v_loss = entropy = float('nan')

    # Bookkeeping
    ep_reward = sum(reward_list)
    reward_records.append(ep_reward)
    loss_records.append((loss, p_loss, v_loss, entropy))

    # Show the latest metrics in the tqdm bar
    progress_bar.set_description(
        f"Ep {episode:>3} | R: {ep_reward:>6.1f} | Loss: {loss:.3f} | Ent: {entropy:.2f}"
    )

    # Stop early once the mean of the last 50 episode rewards exceeds 800
    if episode >= 50:
        if np.mean(reward_records[-50:]) > 800:
            print("\n✅ Early stopping: average reward > 800.")
            break

print("✅ Training complete.")

# %%
# Persist the trained policy; only the actor's parameters are written
torch.save(actor_func.state_dict(), "ppo_f110_actor.pt")
print("✅ Saved actor weights to 'ppo_f110_actor.pt'")

In [None]:
# Plot reward curve and 50-episode moving average
def moving_average(data, window_size=50):
    """
    Return the simple moving average of ``data`` over ``window_size`` points.

    Only full windows are averaged, so the result has
    ``len(data) - window_size + 1`` entries. When ``data`` is shorter than the
    window it is returned unchanged.
    """
    if len(data) >= window_size:
        kernel = np.ones(window_size) / window_size
        return np.convolve(data, kernel, mode='valid')
    return data

# Smoothed rewards. moving_average only averages full windows, so its output
# is shorter than reward_records by (window - 1) points; offset its x-values
# so each average is drawn at the last episode of the window it covers
# instead of starting at episode 0. The offset is 0 when the data is shorter
# than the window and comes back unchanged.
reward_ma = moving_average(reward_records)
ma_offset = len(reward_records) - len(reward_ma)
ma_episodes = np.arange(ma_offset, ma_offset + len(reward_ma))

plt.figure(figsize=(10, 5))
plt.plot(reward_records, label='Episode Reward')
plt.plot(ma_episodes, reward_ma, label='Moving Average (50 episodes)', linewidth=2)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("PPO Training Reward Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()