PoseEstimation flow:
============================

In [None]:
# seeding

import pytorch_lightning as pl
import randomname

# Draw the random run name BEFORE seeding. Presumably `randomname` relies on a
# global RNG, in which case drawing after seeding would yield the same name on
# every run -- TODO confirm.
version = randomname.get_name()
seed = 22742

# Seed through Lightning's helper; `workers=True` is passed so that the seed is
# also meant to cover dataloader workers (see the Lightning docs for details).
pl.seed_everything(seed, workers=True)

Model to use:
-------------

In [None]:
from pedestrians_video_2_carla.data.carla.skeleton import CARLA_SKELETON
# Dataset, model input and model output all use the same CARLA skeleton.
DATA_NODES = INPUT_NODES = OUTPUT_NODES = CARLA_SKELETON


In [None]:
from pedestrians_video_2_carla.modules.flow.output_types import PoseEstimationModelOutputType
from torch import nn
from pedestrians_video_2_carla.modules.pose_estimation.pose_estimation import PoseEstimationModel



class AvPedestrianPose(PoseEstimationModel):
    """
    Minimal baseline pose estimator.

    Every frame of the clip is average-pooled, flattened, and passed through a
    single linear layer that regresses an (x, y) pair per output node.

    The linear layer is sized for exactly ``3 * 46 * 122`` flattened features,
    i.e. 3 channels and 46 * 122 pooled positions per channel. Input frames
    (and ``stride``) must therefore produce exactly that many values after
    pooling, otherwise the linear layer rejects the input.

    :param stride: stride of the average pooling (kernel size 9, padding 1).
    :param kwargs: forwarded unchanged to ``PoseEstimationModel``.
    """

    def __init__(self,
                 stride=8,
                 **kwargs
                 ):
        super().__init__(
            **kwargs
        )

        self.__input_size = 3  # RGB
        self.__output_nodes_len = len(self.output_nodes)
        self.__output_size = self.__output_nodes_len

        self.pool_center = nn.AvgPool2d(kernel_size=9, stride=stride, padding=1)
        # 46 * 122 is the hard-coded number of pooled positions per channel;
        # it is NOT derived from `stride` or from the actual frame size.
        self.linear = nn.Linear(
            self.__input_size * 46 * 122,
            self.__output_nodes_len * 2
        )

    @property
    def output_type(self) -> PoseEstimationModelOutputType:
        """The model emits 2D joint locations."""
        return PoseEstimationModelOutputType.pose_2d

    @property
    def needs_confidence(self) -> bool:
        """This model does not ask for confidence values."""
        return False

    def forward(self, x, *args, **kwargs):
        """
        :param x: 5-D tensor unpacked as (batch, time, channels, height, width).
        :return: tensor of shape (batch, time, number of output nodes, 2).
        """
        b, t, c, h, w = x.shape

        # `reshape` instead of `view`: identical for contiguous input, but it
        # also accepts non-contiguous clips (e.g. produced by a permute),
        # where `view` raises a RuntimeError.
        frames = x.reshape(b * t, c, h, w)

        pooled = self.pool_center(frames)
        flat = pooled.reshape(b, t, -1)  # one flattened feature vector per frame
        coords = self.linear(flat)

        return coords.reshape(b, t, self.__output_size, 2)


In [None]:
from pedestrians_video_2_carla.modules.flow.output_types import PoseEstimationModelOutputType
from torch import nn
from pedestrians_video_2_carla.modules.pose_estimation.pose_estimation import PoseEstimationModel
from torchvision.models import resnet18
from torch_geometric.nn import TransformerConv


class AvPedestrianPoseTransformer(PoseEstimationModel):
    """
    ResNet-18 + Transformer-encoder pose estimator.

    Pipeline, per clip:

    1. every frame goes through a pretrained ResNet-18 with its last child
       module removed, giving one 512-wide feature vector per frame;
    2. two linear layers (512 -> 128 -> 2 * number of output nodes), each
       followed by dropout and ReLU;
    3. a 6-layer ``nn.TransformerEncoder`` (``batch_first=True``) run over the
       time axis of the clip;
    4. dropout, ReLU and a final linear layer, reshaped to one (x, y) pair per
       output node.

    Note: the encoder uses ``nhead=4`` with
    ``d_model = 2 * number of output nodes``; PyTorch requires ``d_model`` to
    be divisible by the number of heads.

    :param kwargs: forwarded unchanged to ``PoseEstimationModel``.
    """

    def __init__(self,
                 **kwargs
                 ):
        super().__init__(
            **kwargs
        )

        self.__output_nodes_len = len(self.output_nodes)

        # Pretrained backbone with the last child module dropped
        # (in torchvision's ResNet that is the classification `fc` layer).
        resnet_backbone = resnet18(pretrained=True)
        self.reduced_resnet18 = nn.Sequential(*(list(resnet_backbone.children())[:-1]))

        self.linear_first = nn.Linear(512, 128)
        self.linear_second = nn.Linear(128, self.__output_nodes_len * 2)

        # NOTE(review): nn.TransformerEncoder clones the layer it receives, so
        # the parameters of `self.encoder_layer` itself are registered on this
        # module but do not take part in forward(). Kept as-is so that
        # parameter names and initialisation order stay unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.__output_nodes_len * 2,
            nhead=4,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)

        # NOTE(review): `trans_0` is never called in forward(); it only adds
        # unused parameters. Kept so that existing checkpoints and the
        # initialisation order of the layers below are unaffected.
        self.trans_0 = TransformerConv(
            in_channels=self.__output_nodes_len * 2,
            out_channels=self.__output_nodes_len * 2,
            heads=4,
            bias=True
            )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.linear_after_transformer = nn.Linear(self.__output_nodes_len * 2, self.__output_nodes_len * 2)

    @property
    def output_type(self) -> PoseEstimationModelOutputType:
        """The model emits 2D joint locations."""
        return PoseEstimationModelOutputType.pose_2d

    @property
    def needs_confidence(self) -> bool:
        """This model does not ask for confidence values."""
        return False

    def forward(self, x, *args, **kwargs):
        """
        :param x: 5-D tensor unpacked as (batch, time, channels, height, width).
        :return: tensor of shape (batch, time, number of output nodes, 2).
        """
        b, t, c, h, w = x.shape

        # `reshape` instead of `view`: identical for contiguous input, but it
        # also accepts non-contiguous clips, where `view` raises.
        frames = x.reshape(b * t, c, h, w)

        # Per-frame backbone features, regrouped into (batch, time, features).
        features = self.reduced_resnet18(frames)
        features = features.reshape(b, t, -1)

        # Note the order used throughout: linear -> dropout -> ReLU.
        hidden = self.relu(self.dropout(self.linear_first(features)))
        hidden = self.relu(self.dropout(self.linear_second(hidden)))

        # `hidden` is already (batch, time, 2 * nodes), which is what the
        # batch-first encoder expects, so no reshaping is needed around it.
        encoded = self.encoder(hidden)

        # TODO (from the original author): this dropout + ReLU pair is marked
        # "to be removed".
        encoded = self.relu(self.dropout(encoded))

        coords = self.linear_after_transformer(encoded)

        return coords.reshape(b, t, self.__output_nodes_len, 2)


In [None]:
from pedestrians_video_2_carla.modules.pose_estimation.unipose.unipose_lstm import UniPoseLSTM

# Any model plugged into the flow needs to inherit from
# `pedestrians_video_2_carla.modules.pose_estimation.pose_estimation.PoseEstimationModel`.
#
# Alternative model, kept for reference:
# model = UniPoseLSTM(
#     stride=8,
#     output_stride=16,
#     backbone="resnet50",
#     input_nodes=INPUT_NODES,
#     output_nodes=OUTPUT_NODES,
#     sigma=3,
#     # standard model params:
#     movements_lr=0.0001,
#     movements_enable_lr_scheduler=True,
#     movements_scheduler_type="StepLR",
#     movements_scheduler_gamma=0.333
# )

# Pass the skeletons explicitly so the model's output width always matches the
# INPUT_NODES / OUTPUT_NODES handed to the flow and the loggers, instead of
# relying on whatever the model's defaults happen to be.
model = AvPedestrianPoseTransformer(
    input_nodes=INPUT_NODES,
    output_nodes=OUTPUT_NODES,
)

In [None]:
# Path of a checkpoint to resume from; leave as None to start from scratch.
ckpt_path = None

Flow definition:
----------------

Also handles checkpoint loading if needed.

In [None]:
# get flow model (LitPoseEstimationFlow)

from pedestrians_video_2_carla.modules.flow.pose_estimation import LitPoseEstimationFlow

# Both construction paths take the same arguments; only the entry point differs.
flow_kwargs = dict(
    movements_model=model,
    input_nodes=INPUT_NODES,
    output_nodes=OUTPUT_NODES,
    loss_modes=["heatmaps" if model.output_type == PoseEstimationModelOutputType.heatmaps else "loc_2d"],
)

if ckpt_path is None:
    # Fresh flow, nothing to restore.
    flow = LitPoseEstimationFlow(**flow_kwargs)
else:
    # Restore the flow from the given checkpoint.
    flow = LitPoseEstimationFlow.load_from_checkpoint(
        checkpoint_path=ckpt_path,
        **flow_kwargs
    )

In [None]:
# DataModule to use

from pedestrians_video_2_carla.data.carla.carla_recorded_video_datamodule import CarlaRecordedVideoDataModule
from pedestrians_video_2_carla.data.base.base_transforms import BaseTransforms

# Data module feeding the flow. Heatmap targets are requested only when the
# model's output type is `heatmaps`; the skeletons come from the config cell.
dm = CarlaRecordedVideoDataModule(
    batch_size=1,
    num_workers=2,
    clip_offset=5,
    clip_length=5,  # presumably frames per clip, i.e. `t` in the models' forward() -- TODO confirm
    transforms=BaseTransforms.hips_neck_bbox,
    data_nodes=DATA_NODES,
    input_nodes=INPUT_NODES,
    needs_heatmaps=(model.output_type == PoseEstimationModelOutputType.heatmaps),
    sigma=3,  # for heatmaps generation (kernel size)
    fast_dev_run=False
)

In [None]:
# Prepare the data if needed. Trainer.fit() would do this automatically; it is
# called here explicitly for clarity.

dm.prepare_data()

In [None]:
# loggers and callbacks

import os

from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, ModelSummary
from pytorch_lightning.loggers.wandb import WandbLogger

from pedestrians_video_2_carla.loggers.pedestrian import PedestrianLogger
from pedestrians_video_2_carla.loggers.pedestrian.enums import PedestrianRenderers

import wandb

# Tell W&B which notebook this run comes from.
# NOTE(review): this and the other absolute paths below ('/runs',
# '/datasets/...') are hard-coded; presumably they match the project's
# container layout -- adjust when running elsewhere.
os.environ["WANDB_NOTEBOOK_NAME"] = '/app/notebooks/flows/pose_estimation/av_pedestrian_pose.ipynb'

# Main experiment logger; the random run name doubles as the version id.
logger = WandbLogger(
    save_dir='/runs',
    name=version,
    version=version,
    project="pose_estimation",
    entity="carla-pedestrians",
    log_model=True,  # this will log models created by ModelCheckpoint
    tags=["ipynb"],
)
# Parent of the W&B experiment directory; videos and checkpoints are stored
# in sub-directories of it. Accessing `logger.experiment` happens here.
log_dir = os.path.realpath(os.path.join(str(logger.experiment.dir), ".."))

# Project-specific logger for the renderers listed below; it reuses the W&B
# logger's name and version.
pedestrian_logger = PedestrianLogger(
    save_dir=os.path.join(log_dir, "videos"),
    name=logger.name,
    version=logger.version,
    renderers=[PedestrianRenderers.source_videos, PedestrianRenderers.target_points, PedestrianRenderers.projection_points],
    source_videos_dir='/datasets/CARLA/WideCameraPedestrians',
    source_videos_overlay_bboxes=True,
    source_videos_overlay_skeletons=True,
    video_saving_frequency_reduction=0,  # turn off video by setting to 0
    max_videos=1,
    input_nodes=INPUT_NODES,
    output_nodes=OUTPUT_NODES,
)

# Keep only the single best checkpoint, judged by the lowest "val_loss/primary".
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(log_dir, "checkpoints"),
    monitor="val_loss/primary",
    mode="min",
    save_top_k=1,
)
# Log the learning rate at every step and print a model summary up to depth 3.
lr_monitor = LearningRateMonitor(logging_interval="step")
model_summary = ModelSummary(max_depth=3)

In [None]:
# Actual trainer.
#
# `dataset_fraction` is passed as both `limit_train_batches` and
# `limit_val_batches`, so only that fraction of each split is used per epoch.

dataset_fraction = 0.0005 # of 60 000 clips
# dataset_fraction = 0.5 # of 60 000 clips


# NOTE(review): the notebook seeds everything at the top, yet
# `deterministic=False` here, so runs are presumably not guaranteed to be
# bit-for-bit reproducible -- confirm this is intended.
trainer = pl.Trainer(
    gpus=1,
    auto_select_gpus=True,
    log_every_n_steps=1,
    num_sanity_val_steps=8,
    check_val_every_n_epoch=1,
    limit_val_batches=dataset_fraction,
    limit_train_batches=dataset_fraction,
    max_epochs=3,
    deterministic=False,
    fast_dev_run=False,
    logger=[logger, pedestrian_logger],
    callbacks=[checkpoint_callback, lr_monitor, model_summary],
)

In [None]:
# Train! Meta decoding happens at the beginning with no visible output and can
# take a few minutes.

trainer.fit(model=flow, datamodule=dm, ckpt_path=ckpt_path)  # ckpt_path is needed to resume optimizer states etc.

In [None]:
# Mark the W&B run as finished so that it does not stay open after training.
wandb.finish()