In [2]:
pip install lightning torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install av torchvision

Collecting av
  Downloading av-14.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Downloading av-14.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: av
Successfully installed av-14.2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import av
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
import pytorch_lightning as pl
import torch.nn.functional as F
from pytorch_lightning import Trainer
import torchvision.models.video as models
import torchvision.transforms as transforms
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint

In [5]:
root_dir = "/kaggle/input/word-signs-processed/processed_videos"

data = []

for label in os.listdir(root_dir):
    label_path = os.path.join(root_dir, label)
    if os.path.isdir(label_path):
        for video_file in os.listdir(label_path):
            if video_file.endswith(".mp4"):
                file_path = os.path.join(label_path, video_file)
                data.append((file_path, label))

df = pd.DataFrame(data, columns=["file_path", "label"])
df

Unnamed: 0,file_path,label
0,/kaggle/input/word-signs-processed/processed_v...,no
1,/kaggle/input/word-signs-processed/processed_v...,no
2,/kaggle/input/word-signs-processed/processed_v...,no
3,/kaggle/input/word-signs-processed/processed_v...,no
4,/kaggle/input/word-signs-processed/processed_v...,no
...,...,...
495,/kaggle/input/word-signs-processed/processed_v...,help
496,/kaggle/input/word-signs-processed/processed_v...,help
497,/kaggle/input/word-signs-processed/processed_v...,help
498,/kaggle/input/word-signs-processed/processed_v...,help


In [6]:
encoder = OneHotEncoder(sparse_output=False)
ohe_labels = encoder.fit_transform(df[['label']])
df['ohe_label'] = list(ohe_labels)

In [7]:
df.drop(["label"], axis = 1, inplace = True)
df.rename(columns={'ohe_label': 'label'}, inplace=True)

In [8]:
df

Unnamed: 0,file_path,label
0,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
495,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
496,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
497,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
498,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
train_df, tesval_df = train_test_split(df, test_size = 0.25, random_state = 42)
val_df, test_df = train_test_split(tesval_df, test_size = (1 - (0.1/0.25)), random_state = 42)

# Reset index
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [28]:
val_df

Unnamed: 0,file_path,label
0,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,/kaggle/input/word-signs-processed/processed_v...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
5,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
7,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
9,/kaggle/input/word-signs-processed/processed_v...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


In [11]:
print(dir(models))

['MC3_18_Weights', 'MViT', 'MViT_V1_B_Weights', 'MViT_V2_S_Weights', 'R2Plus1D_18_Weights', 'R3D_18_Weights', 'S3D', 'S3D_Weights', 'Swin3D_B_Weights', 'Swin3D_S_Weights', 'Swin3D_T_Weights', 'SwinTransformer3d', 'VideoResNet', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'mc3_18', 'mvit', 'mvit_v1_b', 'mvit_v2_s', 'r2plus1d_18', 'r3d_18', 'resnet', 's3d', 'swin3d_b', 'swin3d_s', 'swin3d_t', 'swin_transformer']


In [12]:
class SignDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.transform = transforms.Compose([
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        video_path = self.data.iloc[idx]["file_path"]
        label = self.data.iloc[idx]["label"]
        container = av.open(video_path)
        frames = [frame.to_image() for frame in container.decode(video=0)]
        container.close()
        
        frames = torch.stack([self.transform(frame) for frame in frames])
        frames = frames.permute(1, 0, 2, 3)
        
        return frames, label

train_loader = DataLoader(SignDataset(train_df), batch_size=8, shuffle=True)
val_loader = DataLoader(SignDataset(val_df), batch_size=8, shuffle=False)
test_loader = DataLoader(SignDataset(test_df), batch_size=8, shuffle=False)

In [13]:
for vids in train_loader:
    print(vids[0].shape)
    print(len(vids[1]))
    break

torch.Size([8, 3, 16, 224, 224])
8


In [43]:
class SignLanguageModel(pl.LightningModule):
    def __init__(self, num_classes):
        super().__init__()
        self.model = models.video.mc3_18(weights="DEFAULT")
        self.model.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self.model.fc.in_features, num_classes)
        )
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        videos, labels = batch
        labels = labels.argmax(dim=1).to(torch.long)
        preds = self(videos)
        loss = self.criterion(preds, labels)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        videos, labels = batch
        labels = labels.argmax(dim=1).to(torch.long)
        preds = self(videos)
        loss = self.criterion(preds, labels)
        acc = (preds.argmax(dim=1) == labels).float().mean()
        
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

        return loss

    def test_step(self, batch, batch_idx):
        videos, labels = batch
        labels = labels.argmax(dim=1).to(torch.long)
        preds = self(videos)
        loss = self.criterion(preds, labels)
        acc = (preds.argmax(dim=1) == labels).float().mean()
        
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", acc, prog_bar=True)

        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-4, weight_decay=5e-4)

In [44]:
trainer = Trainer(
    accelerator="gpu", 
    devices=1, 
    max_epochs=10, 
    callbacks=[
        ModelCheckpoint(dirpath="checkpoints/", filename="best_model", save_top_k=1, monitor="val_loss", mode="min")
    ]
)

sign_model = SignLanguageModel(num_classes)

trainer.fit(sign_model, train_dataloaders=train_loader)
trainer.validate(sign_model, dataloaders = val_loader)
trainer.test(sign_model, dataloaders = test_loader)

Downloading: "https://download.pytorch.org/models/mc3_18-a90a0ba3.pth" to /root/.cache/torch/hub/checkpoints/mc3_18-a90a0ba3.pth

  0%|          | 0.00/44.7M [00:00<?, ?B/s][A
 29%|██▉       | 13.1M/44.7M [00:00<00:00, 137MB/s][A
100%|██████████| 44.7M/44.7M [00:00<00:00, 176MB/s][A


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.030088482424616814, 'test_acc': 1.0}]

In [33]:
for _, labels in val_loader:
    print(f"Labels shape: {labels.shape}, dtype: {labels.dtype}")
    break

Labels shape: torch.Size([8, 10]), dtype: torch.float64


In [34]:
trainer.validate(sign_model, dataloaders = val_loader)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.22239582240581512, 'val_acc': 0.9399999976158142}]

<IPython.core.display.Javascript object>