# CommaAI calibration challenge

The goal of this project is to predict the orientation of a camera from a video.
We predict the pitch and yaw of the camera in radians at every frame of the footage.


In [None]:
!pip install av

In [45]:
import os

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.utils.data as tdata
import torch.nn.functional as F
import torchvision

# Device used for tensors created by the model code: CUDA when available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Frame height and width in pixels; the decoded videos in this notebook have
# shape (frames, 874, 1164, 3).
HEIGHT = 874
WIDTH = 1164
# Number of consecutive frames in every chunk the labeled videos are cut into.
MIN_CHUNK_LENGTH = 30

# Data exploration

In [3]:
# Peek at a single labeled sample: decode the first 60 seconds of the video,
# load the orientation labels stored next to it and print both shapes.
index = 0
video_path = f"labeled/{index}.mp4"
labels_path = f"labeled/{index}.txt"

vframes, _, _ = torchvision.io.read_video(
    video_path, start_pts=0, end_pts=60, pts_unit="sec"
)
print(vframes.shape)

orientations = np.loadtxt(labels_path)
print(orientations.shape)

torch.Size([1200, 874, 1164, 3])


In [38]:
def indices_to_windows(idx: np.ndarray) -> list:
    """Group indices into half-open ``(start, end)`` windows of consecutive values.

    The indices are expected in ascending order; a new window begins wherever
    two neighbouring entries differ by more than 1.

    Examples:
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] -> [(0, 10)]
        [0, 1, 2, 3, 6, 7, 8, 9, 10] -> [(0, 4), (6, 11)]
        [0, 1, 9, 10, 11, 12, 13, 17, 18, 19] -> [(0, 2), (9, 14), (17, 20)]

    Args:
        idx: Sequence of indices to group.

    Returns:
        List of ``(start, end)`` tuples with ``end`` exclusive; empty when
        ``idx`` is empty.
    """
    if len(idx) == 0:
        return []

    # Positions (within idx) of the last element of every window but the final one.
    breaks = np.argwhere(np.diff(idx) > 1).ravel()

    # Pair the position where each window opens with the position where it closes.
    first_positions = [0, *(breaks + 1)]
    last_positions = [*breaks, -1]
    return [
        (idx[first], idx[last] + 1)
        for first, last in zip(first_positions, last_positions)
    ]
    

def nan_windows(arr: np.ndarray) -> list:
    """Return the ``(start, end)`` row windows whose first column is NaN.

    Only column 0 of ``arr`` is inspected. ``end`` is exclusive. An empty
    list is returned when no row has a NaN in that column.
    """
    missing = np.isnan(arr[:, 0])
    if not missing.any():
        return []
    return indices_to_windows(np.argwhere(missing).ravel())

def non_nan_windows(arr: np.ndarray) -> list:
    """Return the ``(start, end)`` row windows whose first column is not NaN.

    Only column 0 of ``arr`` is inspected. ``end`` is exclusive. An empty
    list is returned when every row has a NaN in that column.
    """
    present = ~np.isnan(arr[:, 0])
    if not present.any():
        return []
    return indices_to_windows(np.argwhere(present).ravel())

# print(indices_to_windows([0, 1, 2, 3, 6, 7, 8, 9, 10]))
# print(indices_to_windows([0, 1, 9, 10, 11, 12, 13, 17, 18, 19]))

[(0, 4), (6, 11)]
[(0, 2), (9, 14), (17, 20)]


In [4]:
# Decode the five labeled videos (first 60 seconds of each) and load the
# orientation labels stored next to them, keeping both lists index-aligned.
videos, orientations = [], []
for i in range(5):
    video_path = f"labeled/{i}.mp4"
    labels_path = f"labeled/{i}.txt"
    vframes, _, _ = torchvision.io.read_video(
        video_path, start_pts=0, end_pts=60, pts_unit="sec"
    )
    videos.append(vframes)
    orientations.append(np.loadtxt(labels_path))

In [44]:
# Sanity check: print the frame-tensor shape of every video next to the shape
# of its label array, so the per-video frame and label counts can be compared.
for frames, labels in zip(videos, orientations):
    print(frames.shape, labels.shape)

torch.Size([1200, 874, 1164, 3]) (1200, 2)
torch.Size([1200, 874, 1164, 3]) (1200, 2)
torch.Size([1200, 874, 1164, 3]) (1200, 2)
torch.Size([1200, 874, 1164, 3]) (1200, 2)
torch.Size([1196, 874, 1164, 3]) (1196, 2)


In [43]:
# Cut every loaded video into as many non-overlapping MIN_CHUNK_LENGTH-frame chunks
# as fit inside its stretches of contiguous non-NaN orientation labels.
# Each chunk is saved as labeled_chunks/{video_id}_{chunk_id}.mp4 and its labels as
# labeled_chunks/{video_id}_{chunk_id}.txt, where video_id is the index of the source
# video and chunk_id counts the chunks cut from that video.

# exist_ok=True avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs('labeled_chunks', exist_ok=True)

# Iterate over whatever was actually loaded instead of a hard-coded video count.
for video_id, (vframes, labels) in enumerate(zip(videos, orientations)):
    print(f"processing video {video_id}")
    windows = non_nan_windows(labels)
    chunk_id = 0
    for start, end in windows:
        print(f"processing window {start},{end}")
        if end - start < MIN_CHUNK_LENGTH:
            print(f"skipping window {start},{end} because it's too short")
            continue

        # Integer division: frames left over after the last full chunk are dropped.
        num_chunks = (end - start) // MIN_CHUNK_LENGTH
        for i in range(num_chunks):
            chunk_start = start + i * MIN_CHUNK_LENGTH
            chunk_end = chunk_start + MIN_CHUNK_LENGTH
            chunk = vframes[chunk_start:chunk_end]
            chunk_orientations = labels[chunk_start:chunk_end]
            # non_nan_windows only inspects the first label column, so re-check
            # both columns before anything is written to disk.
            if np.any(np.isnan(chunk_orientations)):
                raise ValueError("chunk has nan orientation data")

            torchvision.io.write_video(
                f"labeled_chunks/{video_id}_{chunk_id}.mp4", chunk, fps=20, video_codec="libx264"
            )
            np.savetxt(f"labeled_chunks/{video_id}_{chunk_id}.txt", chunk_orientations)
            print(f"saved chunk {video_id}_{chunk_id} on window {chunk_start},{chunk_end}")
            chunk_id += 1


processing video 0
processing window 0,1200
saved chunk 0_0 on window 0,30
saved chunk 0_1 on window 30,60
saved chunk 0_2 on window 60,90
saved chunk 0_3 on window 90,120
saved chunk 0_4 on window 120,150
saved chunk 0_5 on window 150,180
saved chunk 0_6 on window 180,210
saved chunk 0_7 on window 210,240
saved chunk 0_8 on window 240,270
saved chunk 0_9 on window 270,300
saved chunk 0_10 on window 300,330
saved chunk 0_11 on window 330,360
saved chunk 0_12 on window 360,390
saved chunk 0_13 on window 390,420
saved chunk 0_14 on window 420,450
saved chunk 0_15 on window 450,480
saved chunk 0_16 on window 480,510
saved chunk 0_17 on window 510,540
saved chunk 0_18 on window 540,570
saved chunk 0_19 on window 570,600
saved chunk 0_20 on window 600,630
saved chunk 0_21 on window 630,660
saved chunk 0_22 on window 660,690
saved chunk 0_23 on window 690,720
saved chunk 0_24 on window 720,750
saved chunk 0_25 on window 750,780
saved chunk 0_26 on window 780,810
saved chunk 0_27 on window 81

# Data

In [None]:
class DataModule(pl.LightningDataModule):
    """Lightning data module serving fixed-length video chunks with per-frame labels.

    Every sample pairs one ``<name>.mp4`` chunk found in ``data_dir`` with the
    ``<name>.txt`` label file next to it. All selected chunks are decoded into
    memory during ``setup``: videos as tensors laid out
    ``(chunk, frame, height, width, channel)`` and labels as ``(chunk, frame, 2)``.

    Args:
        data_dir: Directory holding the chunk videos and their label files.
        batch_size: Number of chunks per batch in every dataloader.
        max_files: Upper bound on the number of chunk videos loaded. Each chunk
            is stored as MIN_CHUNK_LENGTH * HEIGHT * WIDTH * 3 floating-point
            values, so this bound is what keeps memory use in check.
    """

    def __init__(
        self,
        data_dir: str = 'labeled_chunks',
        batch_size: int = 3,
        max_files: int = 10,
    ) -> None:
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_files = max_files

    def setup(self, stage: str) -> None:
        """Select chunk files, split them 80/10/10 and load every split into memory.

        Raises:
            ValueError: If ``data_dir`` contains no ``.mp4`` file, or if a chunk
                video does not decode to exactly MIN_CHUNK_LENGTH frames.
        """
        # os.listdir returns entries in arbitrary order; sort first so the
        # subset kept by max_files is the same on every run and machine.
        filenames = sorted(
            name for name in os.listdir(self.data_dir) if name.endswith('.mp4')
        )[: self.max_files]
        if not filenames:
            raise ValueError('no files')
        print(filenames)
        np.random.shuffle(filenames)

        # The test split receives whatever is left after train and validation.
        num_train = int(len(filenames) * 0.8)
        num_val = int(len(filenames) * 0.1)
        self.train_filenames = filenames[:num_train]
        self.val_filenames = filenames[num_train:num_train + num_val]
        self.test_filenames = filenames[num_train + num_val:]

        self.train_data, self.train_labels = self._load_split(self.train_filenames)
        self.val_data, self.val_labels = self._load_split(self.val_filenames)
        self.test_data, self.test_labels = self._load_split(self.test_filenames)

    def _load_split(self, filenames: list) -> tuple:
        """Decode the given chunk videos and their label files into two tensors."""
        data = torch.zeros((len(filenames), MIN_CHUNK_LENGTH, HEIGHT, WIDTH, 3))
        labels = torch.zeros((len(filenames), MIN_CHUNK_LENGTH, 2))
        for i, filename in enumerate(filenames):
            video_path = os.path.join(self.data_dir, filename)
            # Decode the whole file: start_pts/end_pts are presentation
            # timestamps, not frame counts, so bounding the read with
            # MIN_CHUNK_LENGTH in "pts" units does not select that many frames.
            frames, _, _ = torchvision.io.read_video(video_path, pts_unit="sec")
            # Fail loudly on a wrong frame count; a single decoded frame would
            # otherwise be silently broadcast over the whole chunk on assignment.
            if frames.shape[0] != MIN_CHUNK_LENGTH:
                raise ValueError(
                    f"{video_path} decoded to {frames.shape[0]} frames, "
                    f"expected {MIN_CHUNK_LENGTH}"
                )
            data[i] = frames
            labels_path = os.path.splitext(video_path)[0] + '.txt'
            labels[i] = torch.from_numpy(np.loadtxt(labels_path))
        return data, labels

    def _make_loader(self, data: torch.Tensor, labels: torch.Tensor) -> tdata.DataLoader:
        """Wrap one split in a DataLoader using the module-wide batch size."""
        return tdata.DataLoader(
            tdata.TensorDataset(data, labels),
            batch_size=self.batch_size,
            pin_memory=True,
        )

    def train_dataloader(self) -> tdata.DataLoader:
        """Return the loader over the training split."""
        return self._make_loader(self.train_data, self.train_labels)

    def val_dataloader(self) -> tdata.DataLoader:
        """Return the loader over the validation split."""
        return self._make_loader(self.val_data, self.val_labels)

    def test_dataloader(self) -> tdata.DataLoader:
        """Return the loader over the test split."""
        return self._make_loader(self.test_data, self.test_labels)

# RNN Model

In [None]:
# Original ConvLSTM cell as proposed by Shi et al.
class ConvLSTMCell(nn.Module):
    """One ConvLSTM time step with peephole (Hadamard) connections.

    Args:
        in_channels: Number of channels of the input frame.
        out_channels: Number of channels of the hidden and cell states.
        kernel_size: Kernel size of the gate convolution.
        padding: Padding of the gate convolution.
        activation: ``"tanh"`` or ``"relu"``; applied to the candidate cell
            state and to the cell state that feeds the hidden state.
        frame_size: ``(height, width)`` of the states; fixes the shape of the
            peephole weights.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, padding, activation, frame_size
    ):
        super().__init__()

        if activation == "tanh":
            self.activation = torch.tanh
        elif activation == "relu":
            self.activation = torch.relu
        else:
            # Fail at construction time instead of with an AttributeError on
            # the first forward pass.
            raise ValueError(
                f"unsupported activation {activation!r}; expected 'tanh' or 'relu'"
            )

        # Idea adapted from https://github.com/ndrplz/ConvLSTM_pytorch
        # A single convolution over [input, hidden] produces all four gate
        # pre-activations at once.
        self.conv = nn.Conv2d(
            in_channels=in_channels + out_channels,
            out_channels=4 * out_channels,
            kernel_size=kernel_size,
            padding=padding,
        )

        # Peephole weights for the Hadamard products. They start at zero:
        # torch.Tensor(...) would hand back uninitialised memory, which may
        # contain arbitrary values (including NaN/inf).
        self.W_ci = nn.Parameter(torch.zeros(out_channels, *frame_size))
        self.W_co = nn.Parameter(torch.zeros(out_channels, *frame_size))
        self.W_cf = nn.Parameter(torch.zeros(out_channels, *frame_size))

    def forward(self, X, H_prev, C_prev):
        """Advance the cell by one step.

        Args:
            X: Input frame; concatenated with ``H_prev`` along dim 1 (channels).
            H_prev: Previous hidden state.
            C_prev: Previous cell state.

        Returns:
            Tuple ``(H, C)`` with the new hidden and cell states.
        """
        # Idea adapted from https://github.com/ndrplz/ConvLSTM_pytorch
        conv_output = self.conv(torch.cat([X, H_prev], dim=1))

        # Split the stacked output into the input, forget, candidate and
        # output gate pre-activations.
        i_conv, f_conv, C_conv, o_conv = torch.chunk(conv_output, chunks=4, dim=1)

        input_gate = torch.sigmoid(i_conv + self.W_ci * C_prev)
        forget_gate = torch.sigmoid(f_conv + self.W_cf * C_prev)

        # Current Cell output
        C = forget_gate * C_prev + input_gate * self.activation(C_conv)

        # The output gate peeks at the updated cell state, not the previous one.
        output_gate = torch.sigmoid(o_conv + self.W_co * C)

        # Current Hidden State
        H = output_gate * self.activation(C)

        return H, C


class ConvLSTM(nn.Module):
    """ConvLSTM layer: one ConvLSTMCell unrolled over the time axis of a sequence.

    Args:
        in_channels: Number of channels of every input frame.
        out_channels: Number of channels of the hidden state / output.
        kernel_size: Kernel size forwarded to the cell.
        padding: Padding forwarded to the cell.
        activation: Activation name forwarded to the cell.
        frame_size: ``(height, width)`` forwarded to the cell.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, padding, activation, frame_size
    ):
        super().__init__()

        self.out_channels = out_channels

        # We will unroll this over time steps
        self.convLSTMcell = ConvLSTMCell(
            in_channels, out_channels, kernel_size, padding, activation, frame_size
        )

    def forward(self, X):
        """Run the cell over every time step of ``X``.

        Args:
            X: Frame sequence (batch_size, num_channels, seq_len, height, width).

        Returns:
            Hidden states for all time steps, shaped
            (batch_size, out_channels, seq_len, height, width).
        """
        # Get the dimensions
        batch_size, _, seq_len, height, width = X.size()

        # Allocate on the input's own device and dtype rather than on a global
        # device, so the layer works wherever the module and its data live.
        output = X.new_zeros(batch_size, self.out_channels, seq_len, height, width)

        # Initial hidden state
        H = X.new_zeros(batch_size, self.out_channels, height, width)

        # Initial cell state
        C = X.new_zeros(batch_size, self.out_channels, height, width)

        # Unroll over time steps
        for time_step in range(seq_len):
            H, C = self.convLSTMcell(X[:, :, time_step], H, C)

            output[:, :, time_step] = H

        return output


class Seq2Seq(pl.LightningModule):
    """Stack of ConvLSTM + BatchNorm3d layers followed by a 2D convolution head.

    The recurrent stack processes a whole frame sequence; the head is applied
    to the hidden state of the last time step only and squashed by a sigmoid.

    NOTE(review): ``forward`` therefore yields one frame-shaped tensor
    (batch, num_channels, height, width) with values in (0, 1) per sequence.
    Confirm that the input layout and the targets delivered by the data module
    match this shape and range before training.

    Args:
        num_channels: Channels of the input frames and of the output.
        num_kernels: Hidden channels of every ConvLSTM layer.
        kernel_size: Kernel size used by all convolutions.
        padding: Padding used by all convolutions.
        activation: Activation name passed to the ConvLSTM layers.
        frame_size: ``(height, width)`` of the frames.
        num_layers: Number of ConvLSTM + BatchNorm3d pairs.
    """

    def __init__(
        self,
        num_channels: int = 3,
        num_kernels: int = 64,
        kernel_size: tuple = (3, 3),
        padding: tuple = (1, 1),
        activation: str = "relu",
        frame_size: tuple = (HEIGHT, WIDTH),
        num_layers: int = 1,
    ):
        super().__init__()

        self.sequential = nn.Sequential()

        # Layer names ("convlstm1", "batchnorm1", ...) are part of the
        # state-dict keys, so they are kept exactly as they were.
        for layer_idx in range(1, num_layers + 1):
            self.sequential.add_module(
                f"convlstm{layer_idx}",
                ConvLSTM(
                    # Only the first layer sees the raw frames; the others
                    # consume the previous layer's hidden channels.
                    in_channels=num_channels if layer_idx == 1 else num_kernels,
                    out_channels=num_kernels,
                    kernel_size=kernel_size,
                    padding=padding,
                    activation=activation,
                    frame_size=frame_size,
                ),
            )
            self.sequential.add_module(
                f"batchnorm{layer_idx}", nn.BatchNorm3d(num_features=num_kernels)
            )
            # The first pair is always present, as before, even for num_layers < 1.
            if num_layers < 1:
                break

        # Add Convolutional Layer to predict output frame
        self.conv = nn.Conv2d(
            in_channels=num_kernels,
            out_channels=num_channels,
            kernel_size=kernel_size,
            padding=padding,
        )

    def forward(self, X):
        """Map a frame sequence to a single sigmoid-activated frame.

        Args:
            X: Frame sequence (batch_size, num_channels, seq_len, height, width).

        Returns:
            Tensor of shape (batch_size, num_channels, height, width) in (0, 1).
        """
        # Forward propagation through all the layers
        output = self.sequential(X)

        # Return only the last output frame
        output = self.conv(output[:, :, -1])

        # Functional sigmoid: no need to build an nn.Sigmoid module per call.
        return torch.sigmoid(output)

    def _shared_step(self, batch, log_name: str):
        """Compute the MSE loss of one batch and log it under ``log_name``."""
        X, y = batch
        y_hat = self(X)
        loss = F.mse_loss(y_hat, y)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss of ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss of ``batch`` (logged as ``val_loss``)."""
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)


# Training

In [46]:
# Fit a default-configured Seq2Seq model for two epochs on a single GPU, reading
# the chunked dataset from the directory given below.
data_module = DataModule(data_dir='/kaggle/input/openai-calibration-challenge/labeled_chunks')
model = Seq2Seq()
trainer = pl.Trainer(accelerator='gpu', devices=1, max_epochs=2)
trainer.fit(model, data_module)

NameError: name 'EarlyStopping' is not defined