# Phase 2: Trajectory Prediction with Auxiliary Depth Estimation

# 🧭 Introduction

"""
Welcome to **Phase 2** of the DLAV Project! 🚗💨

In this phase, you'll work with a more challenging dataset that includes:
- RGB **camera images**
- Ground-truth **depth maps**
- Ground-truth **semantic segmentation** labels

Your goal is still to predict the **future trajectory** of the self-driving car (SDC), but you now have more tools at your disposal! 🎯

Here, we provide an example where **depth estimation** is used as an auxiliary task to improve trajectory prediction.

However, you're **free to explore** other auxiliary tasks (e.g., using semantic labels), different loss functions, data augmentations, or better architectures! 💡

This notebook will walk you through loading the dataset, building a model, training with and without the auxiliary task, and visualizing results.
"""

In [None]:
# Install gdown to handle Google Drive file download
!pip install -q gdown

import gdown
import zipfile

download_url = f"https://drive.google.com/uc?id=1YkGwaxBKNiYL2nq--cB6WMmYGzRmRKVr"
output_zip = "dlav_train.zip"
gdown.download(download_url, output_zip, quiet=False)  # Downloads the file to your drive
with zipfile.ZipFile(output_zip, 'r') as zip_ref:  # Extracts the downloaded zip file
    zip_ref.extractall(".")

download_url = "https://drive.google.com/uc?id=1wtmT_vH9mMUNOwrNOMFP6WFw6e8rbOdu"
output_zip = "dlav_val.zip"
gdown.download(download_url, output_zip, quiet=False)
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(".")

download_url = "https://drive.google.com/uc?id=1G9xGE7s-Ikvvc2-LZTUyuzhWAlNdLTLV"
output_zip = "dlav_test_public.zip"
gdown.download(download_url, output_zip, quiet=False)
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(".")

Downloading...
From (original): https://drive.google.com/uc?id=1YkGwaxBKNiYL2nq--cB6WMmYGzRmRKVr
From (redirected): https://drive.google.com/uc?id=1YkGwaxBKNiYL2nq--cB6WMmYGzRmRKVr&confirm=t&uuid=ecb60021-fd3d-4ac4-bba8-389fab1397f1
To: /content/dlav_train.zip
100%|██████████| 439M/439M [00:09<00:00, 45.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1wtmT_vH9mMUNOwrNOMFP6WFw6e8rbOdu
From (redirected): https://drive.google.com/uc?id=1wtmT_vH9mMUNOwrNOMFP6WFw6e8rbOdu&confirm=t&uuid=efa1314a-c3a1-4c58-8880-ec29c1fb1128
To: /content/dlav_val.zip
100%|██████████| 87.8M/87.8M [00:02<00:00, 35.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1G9xGE7s-Ikvvc2-LZTUyuzhWAlNdLTLV
From (redirected): https://drive.google.com/uc?id=1G9xGE7s-Ikvvc2-LZTUyuzhWAlNdLTLV&confirm=t&uuid=7c2fde8e-756b-4fb2-8caf-bb846d15cca5
To: /content/dlav_test_public.zip
100%|██████████| 86.6M/86.6M [00:02<00:00, 37.8MB/s]


### Various imports

In [None]:
import torch
import pickle
import os
import copy
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim

from google.colab import drive
from torchvision import transforms
from torchvision import models
from torchvision.models import ResNet18_Weights
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR

## 📂 The Dataset

We are now working with a richer dataset that includes not just images and trajectories,
but also **depth maps** (and semantic segmentation labels, though unused in this example).

The data is stored in `.pkl` files and each file contains:
- `camera`: RGB image (shape: H x W x 3)
- `sdc_history_feature`: the past trajectory of the car
- `sdc_future_feature`: the future trajectory to predict
- `depth`: ground truth depth map (shape: H x W x 1)

We'll define a `DrivingDataset` class to load and return these tensors in a format our model can work with.

In [None]:
import os
import torch
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import csv
import random
# Seed the PyTorch, NumPy and Python `random` generators with the same fixed value.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

class DrivingDataset(Dataset):
    """
    Minimal loader for the pickled driving samples.

    Each item is read from one `.pkl` file and returned as a dict of tensors:
    `camera` (channels-first, resized to 224x224), `history`, `command`
    (integer class index), `depth` (channels-first, original resolution) and,
    unless `test=True`, `future`.

    Args:
        file_list: paths of the `.pkl` files, one per sample.
        test: when True, `sdc_future_feature` is not read, `future` is omitted
            from the returned dict and no augmentation is applied.
        val: stored on the instance; not used by this class.
        augment: when True, non-test samples are mirrored left-right with
            probability 0.5.
    """

    # Integer class index for every driving command.
    _COMMAND_INDEX = {'left': 0, 'right': 1, 'forward': 2}
    # A left-right mirrored scene turns a left command into a right one and vice versa.
    _MIRRORED_COMMAND = {'left': 'right', 'right': 'left'}

    def __init__(self, file_list, test=False,val=False,augment=False):
        self.samples = file_list
        self.test = test
        self.val = val
        self.augment=augment

        self.transform1 = transforms.Compose([
            transforms.Resize((224, 224)),  # Resize to 224x224
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # NOTE: pickle can execute arbitrary code on load; only open trusted files.
        with open(self.samples[idx], 'rb') as f:
            data = pickle.load(f)

        # Convert numpy arrays to tensors (images/depth become channels-first)
        camera = torch.FloatTensor(data['camera']).permute(2, 0, 1)
        camera = self.transform1(camera)
        history = torch.FloatTensor(data['sdc_history_feature'])
        depth = torch.FloatTensor(data['depth']).permute(2, 0, 1)
        command_name = data['driving_command']

        future = None
        if not self.test:
            future = torch.FloatTensor(data['sdc_future_feature'])

        # Horizontal mirroring. The RNG is only consulted for augmentable samples.
        if self.augment and (not self.test) and random.random() < 0.5:
            camera = torch.flip(camera, dims=[2])
            depth = torch.flip(depth, dims=[2])  # keep depth aligned with the image
            # Negate a whole column (one coordinate over all timesteps), not a
            # single timestep row. Columns 0 and 2 follow DrivingDataset2's
            # "x & yaw" convention.
            # NOTE(review): assumes column 0 is the lateral axis - confirm.
            for trajectory in (history, future):
                trajectory[:, 0] = -trajectory[:, 0]
                trajectory[:, 2] = -trajectory[:, 2]
            command_name = self._MIRRORED_COMMAND.get(command_name, command_name)

        command = torch.tensor(self._COMMAND_INDEX[command_name])

        sample = {
            'camera': camera,
            'history': history,
            'command': command,
        }
        if future is not None:
            sample['future'] = future
        sample['depth'] = depth
        return sample

In [None]:
class DrivingDataset2(Dataset):
    """
    Data loader with additional data processing.

    On top of loading the pickled sample, this class:
      * resizes the camera image to 224x224 and applies ImageNet mean/std
        normalisation (plus colour jitter, rotation and random-resized-crop
        when `augment=True`);
      * resizes the depth map to 224x224 and divides it by its maximum;
      * standardises `history` and `future` with their own per-sample
        mean and standard deviation;
      * when `augment=True` and `test=False`, randomly perturbs depth and
        history and mirrors the whole sample left-right.

    Args:
        file_list: paths of the `.pkl` files, one per sample.
        test: when True, `future` is neither read nor returned and no random
            sample-level augmentation is applied.
        val: stored on the instance; not used by this class.
        augment: enables the random augmentations described above.

    NOTE(review): the camera tensor is not rescaled before ColorJitter /
    Normalize; if the stored images are 0-255 the ImageNet statistics do not
    match that range - confirm the value range of `data['camera']`.
    NOTE(review): rotation / random-resized-crop are applied to the image only,
    so depth (and the trajectory) are not geometrically aligned with an
    augmented image.
    NOTE(review): `future` is standardised with its own statistics, which are
    unknown at inference time, so predictions and ADE/FDE computed from this
    dataset are in normalised units rather than in the raw coordinate scale.
    """

    # Integer class index for every driving command.
    _COMMAND_INDEX = {'left': 0, 'right': 1, 'forward': 2}
    # A left-right mirrored scene turns a left command into a right one and vice versa.
    _MIRRORED_COMMAND = {'left': 'right', 'right': 'left'}

    def __init__(self, file_list, test=False,val=False,augment=False):
        self.samples = file_list
        self.test = test
        self.val = val
        self.augment=augment

        # Not used by __getitem__; kept so the attribute remains available.
        self.transform1 = transforms.Compose([
            transforms.Resize((224, 224)),  # Resize to 224x224
        ])

        # Image transform. Augmentation steps are only inserted when requested,
        # which avoids identity lambdas (lambdas cannot be pickled when
        # DataLoader workers are started with the "spawn" method).
        image_steps = [transforms.Resize((224, 224))]
        if augment:
            image_steps += [
                transforms.ColorJitter(brightness=0.2, contrast=0.3, saturation=0.2, hue=0.1),
                transforms.RandomRotation(degrees=10),
                transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.1)),
            ]
        image_steps.append(
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        )
        self.img_transform = transforms.Compose(image_steps)

        # Depth transform
        self.depth_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.Lambda(self._scale_depth),
        ])

    @staticmethod
    def _scale_depth(depth):
        """Divide a depth map by its maximum; the floor keeps an all-zero map at 0 instead of NaN."""
        return depth / depth.max().clamp_min(1e-8)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # NOTE: pickle can execute arbitrary code on load; only open trusted files.
        with open(self.samples[idx], 'rb') as f:
            data = pickle.load(f)

        # Convert numpy arrays to channels-first tensors and preprocess them
        camera = torch.FloatTensor(data['camera']).permute(2, 0, 1)
        camera = self.img_transform(camera)

        depth = torch.FloatTensor(data['depth']).permute(2, 0, 1)
        depth = self.depth_transform(depth)

        augmenting = self.augment and not self.test

        # Optional depth augmentation: global scale in [0.9, 1.1) and small Gaussian noise
        if augmenting:
            if random.random() < 0.3:
                depth *= torch.rand(1) * 0.2 + 0.9
            if random.random() < 0.3:
                depth += torch.randn_like(depth) * 0.01

        # Per-sample standardisation over all entries of the history tensor
        history = torch.FloatTensor(data['sdc_history_feature'])
        history = (history - history.mean()) / (history.std() + 1e-8)

        command_name = data['driving_command']

        future = None
        if not self.test:
            future = torch.FloatTensor(data['sdc_future_feature'])
            future = (future - future.mean()) / (future.std() + 1e-8)

        # --- Data Augmentation ---
        if augmenting:
            # 1. Random flip (x & yaw), applied consistently to every modality
            # NOTE(review): assumes column 0 is the lateral axis - confirm.
            if random.random() < 0.5:
                camera = torch.flip(camera, dims=[2])
                depth = torch.flip(depth, dims=[2])
                for trajectory in (history, future):
                    trajectory[:, 0] = -trajectory[:, 0]
                    trajectory[:, 2] = -trajectory[:, 2]
                # The command must be mirrored too, otherwise a "left" label
                # ends up attached to a trajectory that now turns right.
                command_name = self._MIRRORED_COMMAND.get(command_name, command_name)

            # 2. Smooth trajectory (x & y) with a 5-tap moving average
            #    (zero padding at both ends of the sequence)
            if random.random() < 0.5:
                kernel = torch.ones(1, 1, 5) / 5.0
                for i in range(2):
                    history[:, i] = F.conv1d(history[:, i].unsqueeze(0).unsqueeze(0),
                                             kernel.to(history.device),
                                             padding=2).squeeze(0).squeeze(0)

            # 3. Speed perturbation (x, y)
            if random.random() < 0.3:
                speed_noise = torch.randn(history[:, :2].shape) * 0.05
                history[:, :2] += speed_noise

            # 4. Yaw jitter
            if random.random() < 0.3:
                yaw_noise = torch.randn(history[:, 2].shape) * 0.1
                history[:, 2] += yaw_noise

        command = torch.tensor(self._COMMAND_INDEX[command_name])

        sample = {
            'camera': camera,
            'history': history,
            'command': command,
        }
        if future is not None:
            sample['future'] = future
        sample['depth'] = depth
        return sample

In [None]:
class Logger:
    """
    Tiny console logger for training metrics.

    By default it is silent (the notebook's training loops print their own
    per-epoch summary); pass `verbose=True` to print every logged entry.
    """
    def __init__(self, verbose=False):
        # Placeholder for potential future configs (e.g., log_dir, wandb_enabled, etc.)
        self.verbose = verbose

    def log(self, step=None, **metrics):
        """
        Logs the given metrics.

        Args:
            step (int, optional): The current step or epoch. Useful for tracking.
            **metrics: Arbitrary keyword arguments representing metric names and values.

        Returns:
            None. When the logger is not verbose nothing is formatted or printed.
        """
        if not self.verbose:
            return  # skip the string formatting entirely when output is disabled
        prefix = f"[Step {step}] " if step is not None else ""
        metric_str = " | ".join(f"{k}: {v}" for k, v in metrics.items())
        print(prefix + metric_str)

### Architecture modules

In [None]:
class CommandEncoder(nn.Module):
    """
    Pipeline for the driving_command input.
    Simple learnable embedding layer.
    Per batch;  [B] integer tensor of command indices as input
                [B x embed_dim] tensor output

    Args:
        embed_dim: size of each embedding vector.
        num_commands: number of distinct command indices (3 by default:
            the left / right / forward classes produced by the datasets).
    """
    def __init__(self,embed_dim=32,num_commands=3):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_commands, embedding_dim=embed_dim)

    def forward(self, command):
        # `command` must hold integer indices in [0, num_commands)
        return self.embedding(command)

In [None]:
class CameraEncoder(nn.Module):
    """
    Camera branch of the planner.

    Wraps an ImageNet-pretrained ResNet18 whose stem (`conv1`, `bn1`) and
    `layer1` parameters are frozen. Every child module of the ResNet except the
    last one is kept as the backbone, and a new linear layer maps the resulting
    512 features to `output_dim`.
    Per batch;  [B,3,H,W] RGB image tensor input
                [B,output_dim] tensor output
    """
    def __init__(self,output_dim=256):
        super().__init__()

        pretrained = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

        # Stop gradient updates for the earliest layers of the network
        frozen_prefixes = ("conv1", "bn1", "layer1")
        for param_name, weight in pretrained.named_parameters():
            if param_name.startswith(frozen_prefixes):
                weight.requires_grad = False

        # Drop the last child of the ResNet and attach our own projection
        trunk = list(pretrained.children())[:-1]
        self.backbone = nn.Sequential(*trunk)
        self.fc = nn.Linear(512,output_dim)

    def forward(self, x):
        batch_size = x.size(0)
        pooled = self.backbone(x)
        flat = pooled.view(batch_size, -1)  # [B,512]
        return self.fc(flat)                # [B,output_dim]

In [None]:
class CameraEncoder2(nn.Module):
    """
    Camera branch that also exposes the backbone output.

    Same construction as the plain camera encoder: an ImageNet-pretrained
    ResNet18 with frozen `conv1`, `bn1` and `layer1` parameters, all children
    but the last used as backbone, and a linear layer from 512 features to
    `output_dim`.
    Per batch;  [B,3,H,W] RGB image tensor input
                tuple output: ([B,output_dim] projected features,
                               backbone output before flattening)
    """
    def __init__(self,output_dim=256):
        super().__init__()

        pretrained = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

        # Stop gradient updates for the earliest layers of the network
        frozen_prefixes = ("conv1", "bn1", "layer1")
        for param_name, weight in pretrained.named_parameters():
            if param_name.startswith(frozen_prefixes):
                weight.requires_grad = False

        # Drop the last child of the ResNet and attach our own projection
        trunk = list(pretrained.children())[:-1]
        self.backbone = nn.Sequential(*trunk)
        self.fc = nn.Linear(512,output_dim)

    def forward(self, x):
        raw_features = self.backbone(x)
        flat = raw_features.view(x.size(0), -1)  # [B,512]
        projected = self.fc(flat)                # [B,output_dim]
        # The un-flattened backbone output is returned alongside the projection
        return projected, raw_features

In [None]:
class HistoryEncoder(nn.Module):
    """
    Encoder for the sdc_history_feature input.

    Each 3-value timestep is projected to `d_model`, a learned positional
    embedding is added, the sequence goes through a stack of Transformer
    encoder layers, and the encoding of the last timestep is returned.
    Per batch;  [B,seq_len,3] history tensor input
                [B,d_model] tensor output
    """
    def __init__(self,d_model=128,nhead=4,num_layers=2,dropout=0.1,seq_len=21):
        super().__init__()
        # One learned vector per timestep
        self.pos_embedding = nn.Parameter(torch.randn(seq_len, d_model))
        self.input_proj = nn.Linear(3, d_model)
        layer = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=4*d_model,
            dropout=dropout,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers)

    def forward(self, history):
        tokens = self.input_proj(history) + self.pos_embedding  # [B,seq_len,d_model]
        # The encoder layers are sequence-first, so swap batch and time axes
        seq_first = tokens.transpose(0, 1)                       # [seq_len,B,d_model]
        encoded = self.transformer(seq_first)
        return encoded[-1]  # last timestep; [B,d_model]

In [None]:
class DepthEncoder(nn.Module):
    """
    Encoder for the depth map input.

    Three stride-2 convolution stages (each followed by batch norm and ReLU)
    widen the channels 1 -> 16 -> 32 -> 64, a global average pool collapses the
    spatial dimensions, and a linear layer maps the 64 features to `output_dim`.
    Per batch;  [B,1,H,W] depth map tensor input (e.g. [B,1,200,300])
                [B,output_dim] tensor output
    """
    def __init__(self, output_dim=256):
        super().__init__()

        # (output channels, kernel size) per stage; padding is kernel // 2,
        # so every stage halves the spatial size (rounded up).
        stage_specs = ((16, 5), (32, 3), (64, 3))

        layers = []
        channels_in = 1
        for channels_out, kernel in stage_specs:
            layers.append(nn.Conv2d(channels_in, channels_out, kernel_size=kernel, stride=2, padding=kernel // 2))
            layers.append(nn.BatchNorm2d(channels_out))
            layers.append(nn.ReLU())
            channels_in = channels_out
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))  # -> (B, 64, 1, 1)

        # Flat Sequential: module indices (and state_dict keys) stay 0..9
        self.encoder = nn.Sequential(*layers)
        self.fc = nn.Linear(64, output_dim)  # -> (B, output_dim)

    def forward(self, x):  # x shape: (B, 1, H, W)
        pooled = self.encoder(x)               # -> (B, 64, 1, 1)
        flat = pooled.view(pooled.size(0), -1)  # -> (B, 64)
        return self.fc(flat)                    # -> (B, output_dim)

In [None]:
class DepthDecoder(nn.Module):
    """
    Auxiliary head that predicts a depth map from camera features.

    Two stride-2 transposed convolutions (each followed by ReLU) reduce the
    channels input_channels -> 256 -> 64 while doubling the spatial size, a 3x3
    convolution produces a single channel, and a bilinear upsampling brings the
    result to `output_size`.
    Per batch;  [B,input_channels,h,w] feature map input
                [B,1,output_size[0],output_size[1]] depth prediction output
    """
    def __init__(self, input_channels=512, output_size=(200, 300)):
        super().__init__()
        upconv_kwargs = dict(kernel_size=3, stride=2, padding=1, output_padding=1)

        expand_first = nn.ConvTranspose2d(input_channels, 256, **upconv_kwargs)
        expand_second = nn.ConvTranspose2d(256, 64, **upconv_kwargs)
        to_single_channel = nn.Conv2d(64, 1, kernel_size=3, padding=1)
        to_target_size = nn.Upsample(size=output_size, mode='bilinear', align_corners=False)

        self.decoder = nn.Sequential(
            expand_first,
            nn.ReLU(),
            expand_second,
            nn.ReLU(),
            to_single_channel,
            to_target_size,
        )

    def forward(self, x):
        depth_map = self.decoder(x)
        return depth_map  # Output shape: (B, 1, H, W)

In [None]:
class SimpleDecoder(nn.Module):
    """
    Decode the 3 input pipeline results.
    Use concatenation and a single fully connected layer.
    Per batch;  processed camera, history and command input
                (respectively [B,camera_dim], [B,history_dim], [B,command_dim])
                [B,sequence_length,output_dim] future position predictions

    Args:
        input_dim: camera_dim + history_dim + command_dim (416 = 256 + 128 + 32
            with the default encoder widths).
        output_dim: number of values predicted per future timestep.
        sequence_length: number of future timesteps.
    """
    def __init__(self, input_dim=416, output_dim=3, sequence_length=60):
        super(SimpleDecoder, self).__init__()
        # Remembered so that forward() reshapes with the configured sizes
        # instead of a hard-coded (60, 3).
        self.output_dim = output_dim
        self.sequence_length = sequence_length
        self.fc = nn.Linear(input_dim, output_dim*sequence_length)

    def forward(self, camera_feat, history_feat, command_feat):

        combined_feat = torch.cat([camera_feat, history_feat, command_feat], dim=-1)  # [B,input_dim]
        future_positions_flat = self.fc(combined_feat)  # [B,sequence_length*output_dim]
        future_positions = future_positions_flat.view(-1, self.sequence_length, self.output_dim)

        return future_positions  # [B,sequence_length,output_dim]

In [None]:
class AttentionDecoder(nn.Module):
    """
    Decode the 3 input pipeline results.
    Each feature vector is projected to a common width `d_latent`, the three
    projections are treated as a 3-token sequence and mixed by one Transformer
    encoder layer (self-attention across the tokens), the tokens are averaged
    and a linear layer produces the trajectory.
    Per batch;  processed camera, history and command input
                (respectively [B,d_camera], [B,d_history], [B,d_command])
                [B,sequence_length,output_dim] future position predictions

    Args:
        output_dim: number of values predicted per future timestep.
        sequence_length: number of future timesteps.
        d_latent: common token width used inside the attention layer.
        d_camera: width of the camera feature vector.
        d_history: width of the history feature vector.
        d_command: width of the command feature vector.
    """
    def __init__(self, output_dim=3, sequence_length=60,d_latent=128,d_camera=256,
                 d_history=128,d_command=32):
        super().__init__()
        self.d_latent=d_latent
        # Remembered so that forward() reshapes with the configured sizes
        # instead of a hard-coded (60, 3).
        self.output_dim=output_dim
        self.sequence_length=sequence_length

        # NOTE: `encoder_layer` stays registered as an attribute so that the
        # state_dict keys of existing checkpoints keep matching.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_latent, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.fc = nn.Linear(d_latent, output_dim*sequence_length)

        self.proj_cam = nn.Linear(d_camera, self.d_latent)
        self.proj_hist = nn.Linear(d_history,self.d_latent)
        self.proj_com = nn.Linear(d_command,self.d_latent)

    def forward(self, camera_feat, history_feat, command_feat):

        camera_feat=self.proj_cam(camera_feat)
        history_feat=self.proj_hist(history_feat)
        command_feat=self.proj_com(command_feat)

        x = torch.stack([camera_feat, history_feat, command_feat], dim=1)  # [B,3,d_latent]
        attn_out = self.encoder(x)  # [B,3,d_latent]
        fused_feat = attn_out.mean(dim=1)  # [B,d_latent]
        predictions = self.fc(fused_feat)
        future_pos = predictions.view(-1, self.sequence_length, self.output_dim)

        return future_pos  # [B, sequence_length, output_dim]

In [None]:
class AttentionDecoderV2(nn.Module):
    """
    Decode the 4 input pipeline results.
    Each feature vector is projected to a common width `d_latent`, the four
    projections are treated as a 4-token sequence and mixed by one Transformer
    encoder layer (self-attention across the tokens), the tokens are averaged
    and a linear layer produces the trajectory.
    Per batch;  processed camera, history, command and depth input
                (respectively [B,dim_camera], [B,d_history], [B,d_command], [B,d_depth])
                [B,sequence_length,output_dim] future position predictions

    Args:
        output_dim: number of values predicted per future timestep.
        sequence_length: number of future timesteps.
        d_latent: common token width used inside the attention layer.
        dim_camera: width of the camera feature vector.
        d_history: width of the history feature vector.
        d_command: width of the command feature vector.
        d_depth: width of the depth feature vector.
    """
    def __init__(self, output_dim=3, sequence_length=60,d_latent=128,dim_camera=256,
                 d_history=128,d_command=32,d_depth=256):
        super().__init__()
        self.d_latent=d_latent
        # Remembered so that forward() reshapes with the configured sizes
        # instead of a hard-coded (60, 3).
        self.output_dim=output_dim
        self.sequence_length=sequence_length

        # NOTE: `encoder_layer` stays registered as an attribute so that the
        # state_dict keys of existing checkpoints keep matching.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_latent, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.fc = nn.Linear(d_latent, output_dim*sequence_length)

        self.proj_cam = nn.Linear(dim_camera, self.d_latent)
        self.proj_hist = nn.Linear(d_history,self.d_latent)
        self.proj_com = nn.Linear(d_command,self.d_latent)
        self.proj_dep = nn.Linear(d_depth,d_latent)

    def forward(self, camera_feat, history_feat, command_feat, depth_feat):

        camera_feat=self.proj_cam(camera_feat)
        history_feat=self.proj_hist(history_feat)
        command_feat=self.proj_com(command_feat)
        depth_feat=self.proj_dep(depth_feat)

        x = torch.stack([camera_feat, history_feat, command_feat, depth_feat], dim=1)  # [B,4,d_latent]
        attn_out = self.encoder(x)  # [B,4,d_latent]
        fused_feat = attn_out.mean(dim=1)  # [B,d_latent]
        predictions = self.fc(fused_feat)
        future_pos = predictions.view(-1, self.sequence_length, self.output_dim)

        return future_pos  # [B, sequence_length, output_dim]

In [None]:
class DrivingPlanner(nn.Module):
    """
    End-to-end planner, combining the encoder and decoder modules.

    Args:
        camera_dim: width of the camera feature vector.
        history_dim: width (d_model) of the history encoder.
        command_dim: width of the command embedding.
        dropout: dropout rate of the history encoder.
        version: decoder variant
            1 - concatenation + linear layer (SimpleDecoder)
            2 - attention over camera/history/command (AttentionDecoder)
            3 - attention over camera/history/command/depth, depth is an
                *input* (AttentionDecoderV2 + DepthEncoder)
            4 - as 2, plus an auxiliary DepthDecoder that predicts depth from
                the camera backbone features; forward returns a tuple.
        d_latent: token width of the attention decoder (only used by version 3).

    Note: the attention decoders are built with their default history and
    command widths (128 and 32), so versions 2-4 expect `history_dim=128`
    and `command_dim=32`.

    Raises:
        ValueError: if `version` is not one of 1, 2, 3, 4.
    """
    def __init__(self,camera_dim,history_dim,command_dim,dropout,version,d_latent=128):
        super().__init__()

        if version not in (1, 2, 3, 4):
            # Fail at construction time instead of with an AttributeError in forward()
            raise ValueError(f"Unknown DrivingPlanner version: {version!r} (expected 1, 2, 3 or 4)")

        self.version=version

        # Version 4 needs the un-flattened backbone features for the depth head,
        # so it gets CameraEncoder2 directly (no throw-away CameraEncoder is built first).
        if version==4:
            self.camera_encoder=CameraEncoder2(output_dim=camera_dim)
        else:
            self.camera_encoder=CameraEncoder(output_dim=camera_dim)
        self.history_encoder=HistoryEncoder(d_model=history_dim,dropout=dropout)
        self.command_encoder=CommandEncoder(embed_dim=command_dim)

        if version==1:
            # Size the linear layer from the actual feature widths
            # (equals the decoder's default of 416 for 256 + 128 + 32).
            self.decoder=SimpleDecoder(input_dim=camera_dim+history_dim+command_dim)
        elif version==2:
            self.decoder=AttentionDecoder(d_camera=camera_dim)
        elif version==3:
            self.decoder=AttentionDecoderV2(dim_camera=camera_dim,d_latent=d_latent)
            self.depth_encoder=DepthEncoder()
        else:  # version==4
            self.decoder=AttentionDecoder(d_camera=camera_dim)
            self.aux_decoder=DepthDecoder()

    def forward(self, camera, history, command, depth=None):
        """
        Predict the future trajectory.

        `depth` is only read by version 3 (where it is a model input); the
        other versions ignore it.

        Returns:
            The predicted future for versions 1-3, or the tuple
            (future, predicted depth map) for version 4.

        Raises:
            ValueError: if version 3 is called without `depth`.
        """
        camera_feat = self.camera_encoder(camera)
        history_feat = self.history_encoder(history)
        command_feat = self.command_encoder(command)

        if self.version==3:
            if depth is None:
                raise ValueError("DrivingPlanner version 3 requires the `depth` input")
            depth_feat=self.depth_encoder(depth)
            future = self.decoder(camera_feat, history_feat, command_feat, depth_feat)
            return future

        elif self.version==4:
            # CameraEncoder2 returns (projected features, un-flattened backbone output)
            flatten, untouched = camera_feat
            future = self.decoder(flatten, history_feat, command_feat)
            depth_pred=self.aux_decoder(untouched)
            return future,depth_pred

        else:
            future=self.decoder(camera_feat,history_feat,command_feat)
            return future

### Training

In [None]:
def _match_depth_size(pred_depth, depth):
    """Bilinearly resize `pred_depth` to the spatial size of `depth` when the two differ."""
    if pred_depth.shape[-2:] == depth.shape[-2:]:
        return pred_depth
    return F.interpolate(pred_depth, size=depth.shape[-2:], mode='bilinear', align_corners=False)


def train(model, train_loader, val_loader, optimizer, logger, num_epochs=50,lambda_loss=0.1, scheduler=None):
    """
    Training routine with auxiliary depth loss.

    The model must return `(pred_future, pred_depth)`. The optimised loss is
    `MSE(pred_future[..., :2], future[..., :2]) + lambda_loss * L1(pred_depth, depth)`.
    After every epoch the validation ADE/FDE are computed (mean over batches of
    the per-batch mean) and the weights with the lowest validation ADE are kept.

    Args:
        model: module called as `model(camera, history, command, depth)`.
        train_loader, val_loader: iterables of batch dicts with the keys
            'camera', 'history', 'future', 'command' and 'depth'.
        optimizer: optimizer over the model parameters.
        logger: object with a `log(step=..., **metrics)` method.
        num_epochs: number of passes over `train_loader`.
        lambda_loss: weight of the auxiliary depth loss.
        scheduler: optional LR scheduler, stepped once per epoch.

    Returns:
        (best_model, best_ADE): a deep copy of the best state_dict (None if no
        epoch produced a finite improvement) and the corresponding ADE.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    best_model=None
    best_ADE=float('inf')

    criterion1 = nn.MSELoss()
    criterion2 = nn.L1Loss()

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss1 = 0
        train_loss2 = 0
        train_loss = 0
        for idx, batch in enumerate(train_loader):
            camera = batch['camera'].to(device)
            history = batch['history'].to(device)
            future = batch['future'].to(device)
            command = batch['command'].to(device)
            depth = batch['depth'].to(device)

            optimizer.zero_grad()
            pred_future, pred_depth = model(camera,history,command,depth)
            # The depth head has a fixed output size that may differ from the
            # size of the ground-truth maps delivered by the dataset.
            pred_depth = _match_depth_size(pred_depth, depth)
            loss1 = criterion1(pred_future[..., :2], future[..., :2])
            loss2 = criterion2(pred_depth,depth)
            loss=loss1+lambda_loss*loss2
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                logger.log(step=epoch * len(train_loader) + idx, loss=loss.item())
            train_loss += loss.item()
            train_loss1 += loss1.item()
            train_loss2 += loss2.item()

        if scheduler is not None:
            scheduler.step()

        # Validation
        model.eval()
        val_loss,val_loss1,val_loss2, ade_all, fde_all = 0,0,0, [], []
        with torch.no_grad():
            for batch in val_loader:
                camera = batch['camera'].to(device)
                history = batch['history'].to(device)
                future = batch['future'].to(device)
                command = batch['command'].to(device)
                depth = batch['depth'].to(device)

                pred_future,pred_depth = model(camera, history, command,depth)
                pred_depth = _match_depth_size(pred_depth, depth)
                loss1 = criterion1(pred_future[..., :2], future[..., :2])
                loss2 = criterion2(pred_depth,depth)
                loss=loss1+lambda_loss*loss2
                # Displacement errors use the xy columns only
                ADE = torch.norm(pred_future[:, :, :2] - future[:, :, :2], p=2, dim=-1).mean()
                FDE = torch.norm(pred_future[:, -1, :2] - future[:, -1, :2], p=2, dim=-1).mean()
                ade_all.append(ADE.item())
                fde_all.append(FDE.item())
                val_loss += loss.item()
                val_loss1 += loss1.item()
                val_loss2 += loss2.item()


        # Save best model (state_dict snapshot, so later epochs cannot overwrite it)
        ADE=np.mean(ade_all)
        if ADE<best_ADE:
            best_ADE=ADE
            best_model=copy.deepcopy(model.state_dict())

        print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss/len(train_loader):.3f}={train_loss1/len(train_loader):.2f}+{lambda_loss*train_loss2/len(train_loader):.2f} | Val Loss: {val_loss/len(val_loader):.3f}={val_loss1/len(val_loader):.2f}+{lambda_loss*val_loss2/len(val_loader):.2f} | ADE: {np.mean(ade_all):.4f} | FDE: {np.mean(fde_all):.4f} | Best ADE: {best_ADE:.4f}')

    return best_model,best_ADE


def train2(model, train_loader, val_loader, optimizer, logger, num_epochs=50, scheduler=None):
    """
    Training routine without auxiliary loss.

    The model must return the predicted future only. Training and validation
    both use `MSE(pred_future[..., :2], future[..., :2])`, i.e. the xy columns.
    After every epoch the validation ADE/FDE are computed (mean over batches of
    the per-batch mean) and the weights with the lowest validation ADE are kept.

    Args:
        model: module called as `model(camera, history, command, depth)`.
        train_loader, val_loader: iterables of batch dicts with the keys
            'camera', 'history', 'future', 'command' and 'depth'.
        optimizer: optimizer over the model parameters.
        logger: object with a `log(step=..., **metrics)` method.
        num_epochs: number of passes over `train_loader`.
        scheduler: optional LR scheduler, stepped once per epoch.

    Returns:
        (best_model, best_ADE): a deep copy of the best state_dict (None if no
        epoch produced a finite improvement) and the corresponding ADE.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    best_model=None
    best_ADE=float('inf')

    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for idx, batch in enumerate(train_loader):
            camera = batch['camera'].to(device)
            history = batch['history'].to(device)
            future = batch['future'].to(device)
            command = batch['command'].to(device)
            depth = batch['depth'].to(device)

            optimizer.zero_grad()
            pred_future = model(camera,history,command,depth)
            loss = criterion(pred_future[..., :2], future[..., :2])
            loss.backward()
            optimizer.step()


            if idx % 10 == 0:
                logger.log(step=epoch * len(train_loader) + idx, loss=loss.item())
            train_loss += loss.item()

        if scheduler is not None:
            scheduler.step()

        # Validation
        model.eval()
        val_loss, ade_all, fde_all = 0, [], []
        with torch.no_grad():
            for batch in val_loader:
                camera = batch['camera'].to(device)
                history = batch['history'].to(device)
                future = batch['future'].to(device)
                command = batch['command'].to(device)
                depth = batch['depth'].to(device)

                pred_future = model(camera, history, command, depth)
                # Same xy-only objective as in training, so that the reported
                # train and validation losses are comparable.
                loss = criterion(pred_future[..., :2], future[..., :2])
                ADE = torch.norm(pred_future[:, :, :2] - future[:, :, :2], p=2, dim=-1).mean()
                FDE = torch.norm(pred_future[:, -1, :2] - future[:, -1, :2], p=2, dim=-1).mean()
                ade_all.append(ADE.item())
                fde_all.append(FDE.item())
                val_loss += loss.item()


        # Save best model (state_dict snapshot, so later epochs cannot overwrite it)
        ADE=np.mean(ade_all)
        if ADE<best_ADE:
            best_ADE=ADE
            best_model=copy.deepcopy(model.state_dict())

        print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f} | ADE: {np.mean(ade_all):.4f} | FDE: {np.mean(fde_all):.4f} | Best ADE: {best_ADE:.4f}')

    return best_model,best_ADE

In [None]:
# Locate the extracted training / validation samples (one .pkl file per sample)
train_data_dir = "train"
val_data_dir = "val"

train_files = [os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir) if f.endswith('.pkl')]
val_files = [os.path.join(val_data_dir, f) for f in os.listdir(val_data_dir) if f.endswith('.pkl')]

# Augmentation is enabled for the training split only
train_dataset = DrivingDataset2(train_files,augment=True)
val_dataset = DrivingDataset2(val_files,val=True)

train_loader = DataLoader(train_dataset, batch_size=32, num_workers=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2)

# Version 3: depth is used as an additional model input (no auxiliary loss)
model = DrivingPlanner(camera_dim=1024,history_dim=128,command_dim=32,dropout=0.1,version=3,d_latent=256)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Divide the learning rate by 10 every 40 epochs
scheduler = StepLR(optimizer, step_size=40, gamma=0.1)

logger = Logger()

print('Initialization successful, starting training')

# `scheduler` must be passed by keyword: as the 6th positional argument it
# would land in the `num_epochs` slot and clash with `num_epochs=100`
# (TypeError: got multiple values for argument 'num_epochs').
best_model,best_ADE=train2(model, train_loader, val_loader, optimizer, logger, num_epochs=100, scheduler=scheduler)

Since everything was run on Scitas, the outputs are not available here; they can be found in the accompanying log file.

### Model weights saving and submission generation

In [None]:
# Mount Google Drive at /content/drive so the weights can be written to it.
drive.mount('/content/drive')
# `best_model` is the state_dict returned by the training routine (not a full module).
torch.save(best_model, "drive/MyDrive/Colab Notebooks/phase2_best.pth")

In [None]:
# Rebuild the architecture with the same hyper-parameters that were used for
# training, then restore the saved weights into it.
final_model = DrivingPlanner(camera_dim=1024,history_dim=128,command_dim=32,dropout=0.1,version=3,d_latent=256)
# map_location="cpu" lets a checkpoint whose tensors were saved from a GPU run
# be loaded on a machine without CUDA; the model is moved to the selected
# device before inference.
state_dict = torch.load("drive/MyDrive/Colab Notebooks/phase2_best.pth", map_location="cpu")
final_model.load_state_dict(state_dict)

In [None]:
# Peek at one test sample to see which fields are available.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("test_public/0.pkl", "rb") as f:
    data = pickle.load(f)
print(data.keys())
# Note the absence of sdc_future_feature

dict_keys(['camera', 'depth', 'driving_command', 'sdc_history_feature', 'semantic_label'])


In [None]:
import pandas as pd
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
final_model=final_model.to(device)
test_data_dir = "test_public"

# Sort the files by their numeric stem so that row i of the submission
# corresponds to "<i>.pkl" (a plain string sort would put "10" before "2").
test_names = sorted(
    (f for f in os.listdir(test_data_dir) if f.endswith(".pkl")),
    key=lambda fn: int(os.path.splitext(fn)[0]),
)
test_files = [os.path.join(test_data_dir, fn) for fn in test_names]

# Use the same preprocessing as during training/validation (DrivingDataset2
# without augmentation); feeding the model DrivingDataset tensors would give
# it un-normalised images, depth and history it has never seen.
# NOTE(review): DrivingDataset2 standardises the training targets per sample,
# so the predictions written below are in that normalised space - confirm
# whether they must be mapped back to the raw coordinate scale before submission.
test_dataset = DrivingDataset2(test_files, test=True)
test_loader = DataLoader(test_dataset, batch_size=250, num_workers=2)
final_model.eval()
all_plans = []
with torch.no_grad():
    for batch in test_loader:
        camera = batch['camera'].to(device)
        history = batch['history'].to(device)
        command = batch['command'].to(device)
        depth = batch['depth'].to(device)

        pred_future = final_model(camera,history,command,depth)
        # Only the xy columns are part of the submission
        all_plans.append(pred_future.cpu().numpy()[..., :2])
all_plans = np.concatenate(all_plans, axis=0)

# Now save the plans as a csv file
pred_xy = all_plans  # shape: (total_samples, T, 2)

# Flatten to (total_samples, T*2)
total_samples, T, D = pred_xy.shape
pred_xy_flat = pred_xy.reshape(total_samples, T * D)

# Build a DataFrame with an ID column
ids = np.arange(total_samples)
df_xy = pd.DataFrame(pred_xy_flat)
df_xy.insert(0, "id", ids)

# Column names: id, x_1, y_1, x_2, y_2, ..., x_T, y_T
new_col_names = ["id"]
for t in range(1, T + 1):
    new_col_names.extend((f"x_{t}", f"y_{t}"))
df_xy.columns = new_col_names

# Save to CSV
df_xy.to_csv("submission_phase2.csv", index=False)

print(f"Shape of df_xy: {df_xy.shape}")

Shape of df_xy: (1000, 121)
