In [1]:
import os
import math
import random
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Any, List, Optional

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import torchvision
import torchvision.transforms as T

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

device = torch.device(
    "cuda" if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else "cpu")
)
print("device:", device)
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)

device: mps
torch: 2.9.1
torchvision: 0.24.1


In [2]:
CSV_PATH = "final_data_new_labels.csv"
IMAGES_ROOT = "processed_data"

OUT_DIR = Path("out_methods_minimum")
(OUT_DIR / "checkpoints").mkdir(parents=True, exist_ok=True)
(OUT_DIR / "tables").mkdir(parents=True, exist_ok=True)
(OUT_DIR / "logs").mkdir(parents=True, exist_ok=True)

assert os.path.isfile(CSV_PATH), f"CSV not found: {CSV_PATH}"
assert os.path.isdir(IMAGES_ROOT), f"Image folder not found: {IMAGES_ROOT}"

print("OUT_DIR:", OUT_DIR.resolve())

OUT_DIR: /Users/lisawang/Cornell/25Fall/AML/final/Vision-Based-Safety-Assessment-for-Pedestrian-Street-Crossing/4-dataset_and_training/out_methods_minimum


In [3]:
class PedXingDataset(Dataset):
    """
    Returns:
      x: Tensor [3, H, W]
      y: dict of targets (some keys may be absent if column not present)
    """

    def __init__(
        self,
        csv_path: str,
        images_root: str,
        subset: str,
        image_size: int = 224,
        preprocess_mode: str = "norm",   # "norm" or "no_norm"
        augment_mode: str = "none",      # "none" or "basic"
        roadway_bin_size_m: float = 5.0,
    ):
        super().__init__()
        self.df = pd.read_csv(csv_path)
        self.df = self.df[self.df["subset"] == subset].reset_index(drop=True)

        self.images_root = images_root
        self.image_size = image_size
        self.preprocess_mode = preprocess_mode
        self.augment_mode = augment_mode
        self.roadway_bin_size_m = roadway_bin_size_m

        self.col_filename = "new_filename" if "new_filename" in self.df.columns else "filename"

        if "safe_to_walk" in self.df.columns:
            self.col_safe = "safe_to_walk"
        elif "safe_to_cross" in self.df.columns:
            self.col_safe = "safe_to_cross"
        else:
            raise ValueError("Missing safety label column: safe_to_walk or safe_to_cross")

        self.col_weather = "weather" if "weather" in self.df.columns else None
        self.col_tlight = "traffic_light" if "traffic_light" in self.df.columns else None

        if "crosswalk_signal" in self.df.columns:
            self.col_psignal = "crosswalk_signal"
        elif "pedestrian_signal" in self.df.columns:
            self.col_psignal = "pedestrian_signal"
        else:
            self.col_psignal = None

        self.col_roadway = "roadway_width" if "roadway_width" in self.df.columns else None

        # binary
        self.col_crosswalk = "crosswalk" if "crosswalk" in self.df.columns else None
        self.col_car = "car" if "car" in self.df.columns else None
        self.col_scooter = "scooter" if "scooter" in self.df.columns else None
        self.col_bike = "bike" if "bike" in self.df.columns else None
        self.col_obstacles = "other_obstacles" if "other_obstacles" in self.df.columns else None

        self.transform = self._build_transform()

    def _build_transform(self):
        base = [
            T.Resize(self.image_size, antialias=True),
            T.CenterCrop(self.image_size),
        ]

        aug = []
        if self.augment_mode == "basic":
            aug = [
                T.RandomResizedCrop(self.image_size, scale=(0.85, 1.0), antialias=True),
                T.RandomHorizontalFlip(p=0.5),
            ]
        elif self.augment_mode == "none":
            aug = []
        else:
            raise ValueError("augment_mode must be 'none' or 'basic' in minimum notebook")

        to_tensor = [T.ToTensor()]
        if self.preprocess_mode == "norm":
            norm = [T.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])]
        elif self.preprocess_mode == "no_norm":
            norm = []
        else:
            raise ValueError("preprocess_mode must be 'norm' or 'no_norm'")

        if len(aug) > 0:
            return T.Compose(aug + to_tensor + norm)
        return T.Compose(base + to_tensor + norm)

    def _roadway_width_to_bin(self, w: Any) -> int:
        try:
            w = float(w)
        except Exception:
            return -1
        if not np.isfinite(w) or w < 0:
            return -1
        return int(math.floor(w / self.roadway_bin_size_m))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        fname = str(row[self.col_filename])
        path = os.path.join(self.images_root, fname)
        img = Image.open(path).convert("RGB")
        x = self.transform(img)

        y = {}
        y["safe_to_cross"] = torch.tensor(int(row[self.col_safe]), dtype=torch.long)

        if self.col_weather is not None:
            y["weather"] = torch.tensor(int(row[self.col_weather]), dtype=torch.long)
        if self.col_psignal is not None:
            y["pedestrian_signal"] = torch.tensor(int(row[self.col_psignal]), dtype=torch.long)
        if self.col_tlight is not None:
            y["traffic_light"] = torch.tensor(int(row[self.col_tlight]), dtype=torch.long)
        if self.col_roadway is not None:
            y["roadway_width_bin"] = torch.tensor(self._roadway_width_to_bin(row[self.col_roadway]), dtype=torch.long)

        def add_bin(col, key):
            if col is None:
                return
            y[key] = torch.tensor(int(row[col]), dtype=torch.long)

        add_bin(self.col_crosswalk, "crosswalk")
        add_bin(self.col_car, "car")
        add_bin(self.col_scooter, "scooter")
        add_bin(self.col_bike, "bike")
        add_bin(self.col_obstacles, "other_obstacles")

        return x, y

In [4]:
def make_loader(subset: str, image_size: int, preprocess_mode: str, augment_mode: str,
                batch_size: int = 16, shuffle: bool = True, roadway_bin_size_m: float = 5.0) -> DataLoader:
    ds = PedXingDataset(
        csv_path=CSV_PATH,
        images_root=IMAGES_ROOT,
        subset=subset,
        image_size=image_size,
        preprocess_mode=preprocess_mode,
        augment_mode=augment_mode,
        roadway_bin_size_m=roadway_bin_size_m,
    )
    use_pin_memory = torch.cuda.is_available()
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=0, pin_memory=use_pin_memory)

def infer_roadway_num_classes(train_ds: PedXingDataset) -> int:
    if train_ds.col_roadway is None:
        return 1
    bins = []
    for i in range(len(train_ds.df)):
        b = train_ds._roadway_width_to_bin(train_ds.df.iloc[i][train_ds.col_roadway])
        if b >= 0:
            bins.append(b)
    return int(max(bins) + 1) if len(bins) > 0 else 1

In [5]:
class ResNetBackbone(nn.Module):
    def __init__(self, pretrained: bool = True):
        super().__init__()
        weights = torchvision.models.ResNet18_Weights.DEFAULT if pretrained else None
        m = torchvision.models.resnet18(weights=weights, progress=False)
        self.features = nn.Sequential(*list(m.children())[:-1])
        self.feat_dim = 512

    def forward(self, x):
        z = self.features(x)
        return z.flatten(1)

class MLPHead(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, hidden_dim: int = 256, dropout: float = 0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, out_dim),
        )
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, z):
        return self.net(z)

class MultiTaskResNet(nn.Module):
    def __init__(self, roadway_num_classes: int, pretrained: bool = True, freeze_backbone: bool = False):
        super().__init__()
        self.backbone = ResNetBackbone(pretrained=pretrained)
        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False

        d = self.backbone.feat_dim
        self.heads = nn.ModuleDict({
            "safe_to_cross": MLPHead(d, 2),
            "weather": MLPHead(d, 3),
            "pedestrian_signal": MLPHead(d, 3),
            "traffic_light": MLPHead(d, 3),
            "roadway_width_bin": MLPHead(d, roadway_num_classes),
            "crosswalk": MLPHead(d, 2),
            "car": MLPHead(d, 2),
            "scooter": MLPHead(d, 2),
            "bike": MLPHead(d, 2),
            "other_obstacles": MLPHead(d, 2),
        })

    def forward(self, x):
        z = self.backbone(x)
        return {k: head(z) for k, head in self.heads.items()}

In [6]:
def accuracy_from_logits(logits: torch.Tensor, y: torch.Tensor) -> float:
    pred = logits.argmax(dim=1)
    return (pred == y).float().mean().item()

@torch.no_grad()
def evaluate(model: nn.Module, loader: DataLoader) -> Dict[str, float]:
    model.eval()
    acc_safe = []
    for x, y in loader:
        x = x.to(device)
        y = {k: v.to(device) for k, v in y.items()}
        out = model(x)
        acc_safe.append(accuracy_from_logits(out["safe_to_cross"], y["safe_to_cross"]))
    return {"safe_acc": float(np.mean(acc_safe)) if len(acc_safe) else float("nan")}

def compute_loss(out: Dict[str, torch.Tensor], y: Dict[str, torch.Tensor], weights: Dict[str, float]) -> torch.Tensor:
    total = 0.0

    def add_ce(key: str):
        nonlocal total
        if key not in out or key not in y:
            return
        yy = y[key]
        if key == "roadway_width_bin":
            mask = yy >= 0
            if mask.sum().item() == 0:
                return
            logits = out[key][mask]
            target = yy[mask]
        else:
            logits = out[key]
            target = yy
        total = total + weights.get(key, 0.0) * F.cross_entropy(logits, target)

    # primary + a few auxiliaries (keep minimal compute)
    add_ce("safe_to_cross")
    add_ce("weather")
    add_ce("pedestrian_signal")
    add_ce("traffic_light")
    add_ce("roadway_width_bin")
    add_ce("crosswalk")
    add_ce("car")
    add_ce("scooter")
    add_ce("bike")
    add_ce("other_obstacles")

    return total

In [7]:
@dataclass
class ExpConfig:
    exp_name: str
    freeze_backbone: bool
    preprocess_mode: str       # "norm" or "no_norm"
    augment_mode: str          # "none" or "basic"
    image_size: int = 224
    pretrained: bool = True
    epochs: int = 20
    lr: float = 1e-3
    weight_decay: float = 1e-4

def run_one(cfg: ExpConfig) -> Dict[str, Any]:
    train_loader = make_loader("train", cfg.image_size, cfg.preprocess_mode, cfg.augment_mode, shuffle=True)
    val_loader = make_loader("val", cfg.image_size, cfg.preprocess_mode, "none", shuffle=False)
    test_loader = make_loader("test", cfg.image_size, cfg.preprocess_mode, "none", shuffle=False)

    roadway_num_classes = infer_roadway_num_classes(train_loader.dataset)

    model = MultiTaskResNet(
        roadway_num_classes=roadway_num_classes,
        pretrained=cfg.pretrained,
        freeze_backbone=cfg.freeze_backbone
    ).to(device)

    opt = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad],
                            lr=cfg.lr, weight_decay=cfg.weight_decay)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=cfg.epochs)

    weights = {
        "safe_to_cross": 1.0,
        "weather": 0.3,
        "pedestrian_signal": 0.5,
        "traffic_light": 0.5,
        "roadway_width_bin": 0.3,
        "crosswalk": 0.2,
        "car": 0.2,
        "scooter": 0.2,
        "bike": 0.2,
        "other_obstacles": 0.2,
    }

    best_val = -1.0
    ckpt_path = OUT_DIR / "checkpoints" / f"{cfg.exp_name}.pt"
    hist = []

    for epoch in range(cfg.epochs):
        model.train()
        losses = []
        for x, y in train_loader:
            x = x.to(device)
            y = {k: v.to(device) for k, v in y.items()}
            out = model(x)
            L = compute_loss(out, y, weights)

            opt.zero_grad()
            L.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            opt.step()

            losses.append(float(L.item()))

        sched.step()

        val_m = evaluate(model, val_loader)
        row = {"exp_name": cfg.exp_name, "epoch": epoch, "train_loss": float(np.mean(losses)),
               "val_safe_acc": float(val_m["safe_acc"]), "lr": float(sched.get_last_lr()[0])}
        hist.append(row)

        if val_m["safe_acc"] > best_val:
            best_val = val_m["safe_acc"]
            torch.save({"cfg": asdict(cfg), "state_dict": model.state_dict(),
                        "roadway_num_classes": roadway_num_classes}, ckpt_path)

        print(f"[{cfg.exp_name}] epoch={epoch} val_safe_acc={val_m['safe_acc']:.4f} best={best_val:.4f}")

    # load best -> test
    ckpt = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(ckpt["state_dict"])
    test_m = evaluate(model, test_loader)

    hist_df = pd.DataFrame(hist)
    hist_csv = OUT_DIR / "logs" / f"{cfg.exp_name}_history.csv"
    hist_df.to_csv(hist_csv, index=False)

    return {
        **asdict(cfg),
        "roadway_num_classes": int(ckpt["roadway_num_classes"]),
        "best_val_safe_acc": float(best_val),
        "test_safe_acc": float(test_m["safe_acc"]),
        "ckpt_path": str(ckpt_path),
        "history_csv": str(hist_csv),
    }

In [8]:
experiments = [
    # A1: supervised method 1 (linear probe) + norm + noaug
    ExpConfig(exp_name="A1_linearprobe_norm_noaug", freeze_backbone=True, preprocess_mode="norm", augment_mode="none"),
    # A2: supervised method 2 (finetune) + norm + noaug
    ExpConfig(exp_name="A2_finetune_norm_noaug", freeze_backbone=False, preprocess_mode="norm", augment_mode="none"),
    # B1: augmentation method (basic aug) + finetune + norm
    ExpConfig(exp_name="B1_finetune_norm_basicaug", freeze_backbone=False, preprocess_mode="norm", augment_mode="basic"),
    # C1: preprocessing method (no_norm) + finetune + basic aug
    ExpConfig(exp_name="C1_finetune_nonorm_basicaug", freeze_backbone=False, preprocess_mode="no_norm", augment_mode="basic"),
]

all_results = []
for cfg in experiments:
    all_results.append(run_one(cfg))

results_df = pd.DataFrame(all_results)
results_csv = OUT_DIR / "tables" / "results_summary_min.csv"
results_df.to_csv(results_csv, index=False)
print("Saved:", results_csv)

results_df[["exp_name","freeze_backbone","preprocess_mode","augment_mode","epochs","lr","best_val_safe_acc","test_safe_acc"]].sort_values(
    "test_safe_acc", ascending=False
)

[A1_linearprobe_norm_noaug] epoch=0 val_safe_acc=0.6667 best=0.6667
[A1_linearprobe_norm_noaug] epoch=1 val_safe_acc=0.6198 best=0.6667
[A1_linearprobe_norm_noaug] epoch=2 val_safe_acc=0.6823 best=0.6823
[A1_linearprobe_norm_noaug] epoch=3 val_safe_acc=0.7101 best=0.7101
[A1_linearprobe_norm_noaug] epoch=4 val_safe_acc=0.6979 best=0.7101
[A1_linearprobe_norm_noaug] epoch=5 val_safe_acc=0.5729 best=0.7101
[A1_linearprobe_norm_noaug] epoch=6 val_safe_acc=0.6823 best=0.7101
[A1_linearprobe_norm_noaug] epoch=7 val_safe_acc=0.6667 best=0.7101
[A1_linearprobe_norm_noaug] epoch=8 val_safe_acc=0.7101 best=0.7101
[A1_linearprobe_norm_noaug] epoch=9 val_safe_acc=0.6354 best=0.7101
[A1_linearprobe_norm_noaug] epoch=10 val_safe_acc=0.6667 best=0.7101
[A1_linearprobe_norm_noaug] epoch=11 val_safe_acc=0.5955 best=0.7101
[A1_linearprobe_norm_noaug] epoch=12 val_safe_acc=0.6701 best=0.7101
[A1_linearprobe_norm_noaug] epoch=13 val_safe_acc=0.6667 best=0.7101
[A1_linearprobe_norm_noaug] epoch=14 val_saf

Unnamed: 0,exp_name,freeze_backbone,preprocess_mode,augment_mode,epochs,lr,best_val_safe_acc,test_safe_acc
1,A2_finetune_norm_noaug,False,norm,none,20,0.001,0.776042,0.744792
3,C1_finetune_nonorm_basicaug,False,no_norm,basic,20,0.001,0.784722,0.717014
0,A1_linearprobe_norm_noaug,True,norm,none,20,0.001,0.710069,0.710069
2,B1_finetune_norm_basicaug,False,norm,basic,20,0.001,0.78125,0.670139


In [10]:
def save_table(df: pd.DataFrame, name: str):
    path = OUT_DIR / "tables" / f"{name}.csv"
    df.to_csv(path, index=False)
    print("Saved table:", path)
    return df

t_supervised = results_df[results_df["exp_name"].isin(["A1_linearprobe_norm_noaug","A2_finetune_norm_noaug"])][
    ["exp_name","freeze_backbone","preprocess_mode","augment_mode","epochs","lr","test_safe_acc"]
]
save_table(t_supervised, "table_supervised_methods_min")

t_preproc = results_df[results_df["exp_name"].isin(["B1_finetune_norm_basicaug","C1_finetune_nonorm_basicaug"])][
    ["exp_name","preprocess_mode","augment_mode","epochs","lr","test_safe_acc"]
]
save_table(t_preproc, "table_preprocessing_methods_min")

t_aug = results_df[results_df["exp_name"].isin(["A2_finetune_norm_noaug","B1_finetune_norm_basicaug"])][
    ["exp_name","augment_mode","epochs","lr","test_safe_acc"]
]
save_table(t_aug, "table_augmentation_methods_min")

Saved table: out_methods_minimum/tables/table_supervised_methods_min.csv
Saved table: out_methods_minimum/tables/table_preprocessing_methods_min.csv
Saved table: out_methods_minimum/tables/table_augmentation_methods_min.csv


Unnamed: 0,exp_name,augment_mode,epochs,lr,test_safe_acc
1,A2_finetune_norm_noaug,none,20,0.001,0.744792
2,B1_finetune_norm_basicaug,basic,20,0.001,0.670139
