In [21]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import cv2
from torchvision.tv_tensors import Video, Image
from torchvision.ops import Conv2dNormActivation, DeformConv2d
import os
import torch
# from torchvision.datasets.video_utils import read_video

In [22]:
# Constants
NUM_CLASSES = 100
TARGET_FRAMES = 16  # number of frames per video

In [23]:
# Read video frames using OpenCV
def read_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
    cap.release()
    if len(frames) == 0:
        raise ValueError(f"Could not read any frames from {video_path}")
    frames = torch.from_numpy(np.stack(frames, axis=0))
    return frames


# Custom collate function for batching
def collate_fn(batch):
    frames = torch.stack([item['frames'] for item in batch])
    labels = torch.tensor([item['label_idx'] for item in batch])
    label_names = [item['label'] for item in batch]
    return {'frames': frames, 'label_idx': labels, 'label': label_names}

In [24]:
import os
import cv2
import torch
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
import pickle
import unicodedata

# Define video dataset
class VideoDataset(Dataset):
    def __init__(self, root_dir, label_to_idx_path, transform=None,
                 mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225],
                 target_frames=32):
        self.root_dir = root_dir
        self.transform = transform
        self.mean, self.std = mean, std
        self.target_frames = target_frames
        self.instances, self.labels, self.label_idx = [], [], []

        with open(label_to_idx_path, 'rb') as f:
            raw_mapping = pickle.load(f)
            # Chuẩn hóa key trong mapping về NFC
            self.label_mapping = {unicodedata.normalize('NFC', k): v for k, v in raw_mapping.items()}

        # Lấy danh sách folder và chuẩn hóa tên để so sánh
        folder_names = sorted(os.listdir(root_dir))[:NUM_CLASSES]
        
        for label_folder in folder_names:
            # Chuẩn hóa tên folder về NFC
            norm_label = unicodedata.normalize('NFC', label_folder)
            
            path = os.path.join(root_dir, label_folder)
            if os.path.isdir(path):
                if norm_label not in self.label_mapping:
                    print(f"Bỏ qua: '{label_folder}' (Normalized: '{norm_label}') không có trong mapping")
                    continue

                idx = self.label_mapping[norm_label]
                
                for video_file in os.listdir(path):
                    video_path = os.path.join(path, video_file)
                    self.instances.append(video_path)
                    self.labels.append(norm_label)
                    self.label_idx.append(idx)

    # Downsample frames to fixed length
    def _downsample_frames(self, frames):
        num_frames = frames.shape[0]
        if num_frames == self.target_frames:
            return frames
        elif num_frames < self.target_frames:
            pad = self.target_frames - num_frames
            return torch.cat([frames, frames[-1:].repeat(pad, 1, 1, 1)], dim=0)
        else:
            idx = torch.linspace(0, num_frames - 1, self.target_frames).long()
            return frames[idx]

    # Normalize frames with ImageNet stats
    def _normalize(self, frames):
        frames = frames.permute(0, 3, 1, 2).float() / 255.0
        mean = torch.tensor(self.mean).view(1, 3, 1, 1)
        std = torch.tensor(self.std).view(1, 3, 1, 1)
        return (frames - mean) / std

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        video_path = self.instances[idx]
        label, label_idx = self.labels[idx], self.label_idx[idx]
        frames = read_video(video_path)
        frames = self._downsample_frames(frames)
        frames = self._normalize(frames)
        return {"frames": frames, "label_idx": label_idx, "label": label}

In [25]:
# Debug: kiểm tra mapping vs folders
with open("/kaggle/input/cv-vng-min-bc/dataset/label_mapping.pkl", 'rb') as f:
    label_mapping = pickle.load(f)

folders = sorted(os.listdir("/kaggle/input/cv-vng-min-bc/dataset/train"))[:NUM_CLASSES]
print("Folders:", folders[:5])
print("Mapping keys:", list(label_mapping.keys())[:5])
print("Missing in mapping:", [f for f in folders if f not in label_mapping])

Folders: ['An ủi', 'Ban ngày', 'Ban đêm', 'Biết', 'Biếu tặng']
Mapping keys: ['An ủi', 'Áp dụng', 'Ăn', 'Ăn mừng', 'Ban ngày']
Missing in mapping: ['An ủi', 'Ban ngày', 'Ban đêm', 'Biết', 'Biếu tặng', 'Bàn tay', 'Băn khoăn', 'Bạn thân', 'Bế mạc', 'Bệnh nhân', 'Bệnh viện', 'Bộ y tế', 'Chiều', 'Chào', 'Chân', 'Chúng ta', 'Chạy', 'Chấp nhận', 'Chậm lại', 'Con gấu', 'Cá', 'Cách ly', 'Cám dỗ', 'Có thể', 'Cơ thể', 'Cảm ơn', 'Cần', 'Cứu', 'Dạy dỗ', 'Dễ', 'Ghét', 'Giúp', 'Hâm mộ', 'Hôm nay', 'Họ', 'Học sinh', 'Khai báo', 'Khu cách ly', 'Khóc', 'Khẩu trang', 'Kết hôn', 'Lo lắng', 'Lây bệnh', 'Mời vào', 'Nghỉ ngơi', 'Ngón tay', 'Nhà', 'Nhìn', 'Nhầm', 'Nhớ', 'Nói', 'Nói xấu', 'Nôn ói', 'Nặng', 'Phía sau', 'Phạt', 'Phỏng vấn', 'Phục hồi', 'Rẽ phải', 'Rẽ trái', 'San sẻ', 'Sốt', 'Sử dụng', 'Thích', 'Thăm', 'Thương', 'Thất lạc', 'Thức dậy', 'Thức ăn', 'Trưa', 'Trường học', 'Tôi', 'Tập luyện', 'Tối', 'Uống', 'Vâng lời', 'Xe máy', 'Xe đạp', 'Xin lỗi', 'Xin phép', 'Xuất viện', 'Xúc động', 'Áp dụ

# Video Classification TorchVision

In [26]:
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
model = mvit_v1_b(weights=None)

In [27]:
model.head = torch.nn.Linear(in_features=768, out_features=100, bias=True)

In [28]:
dataset = VideoDataset(root_dir="/kaggle/input/cv-vng-min-bc/dataset/train", transform=MViT_V1_B_Weights.DEFAULT.transforms(), label_to_idx_path="/kaggle/input/cv-vng-min-bc/dataset/label_mapping.pkl", target_frames=TARGET_FRAMES)

In [29]:
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import random_split
import pytorch_lightning as pl
pl.seed_everything(42, workers=True)
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


Seed set to 42


In [30]:
classes_indices = [ele for ele in train_dataset.dataset.label_idx if ele in train_dataset.indices]
print(len(classes_indices))
print(len(dataset))
import numpy as np
class_weights = 1. / np.bincount(classes_indices)
samples_weights = [class_weights[ele] for ele in classes_indices]
sampler = WeightedRandomSampler(weights=samples_weights, num_samples=len(samples_weights), replacement=True)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=False, num_workers=2, collate_fn=collate_fn, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2, collate_fn=collate_fn)

2814
3875


  class_weights = 1. / np.bincount(classes_indices)


In [31]:
print(model)

MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_v): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [32]:
!pip show pytorch-lightning

Name: pytorch-lightning
Version: 2.5.5
Summary: PyTorch Lightning is the lightweight PyTorch wrapper for ML researchers. Scale your models. Write less boilerplate.
Home-page: https://github.com/Lightning-AI/lightning
Author: Lightning AI et al.
Author-email: developer@lightning.ai
License: Apache-2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: fsspec, lightning-utilities, packaging, PyYAML, torch, torchmetrics, tqdm, typing-extensions
Required-by: 


In [33]:
dataset[0]['frames'].shape

torch.Size([16, 3, 224, 224])

In [34]:
from pytorch_lightning import LightningModule, Trainer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torchmetrics import F1Score
F1Score = F1Score(num_classes=NUM_CLASSES, average='macro', task='multiclass')
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import ReduceLROnPlateau

class WrapperModel(LightningModule):
    def __init__(self, lr=1e-2, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = model
        self.criterion = CrossEntropyLoss()
        self.f1 = F1Score
        self.lr = lr
    
    def forward(self, x):
        return self.model(x)
    def training_step(self, batch, batch_idx):
        frames, labels = torch.permute(batch['frames'], (0, 2, 1, 3, 4)), batch['label_idx']
        outputs = self(frames)
        loss = self.criterion(outputs, labels)
        f1 = self.f1(outputs, labels)
        self.log('train_f1', f1, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=False)
        return loss
    def validation_step(self, batch, batch_idx):
        frames, labels = torch.permute(batch['frames'], (0, 2, 1, 3, 4)), batch['label_idx']
        outputs = self(frames)
        loss = self.criterion(outputs, labels)
        f1 = self.f1(outputs, labels)
        self.log('val_f1', f1, prog_bar=True, on_step=False, on_epoch=True)
        self.log('val_loss', loss, prog_bar=True, on_step=True, on_epoch=False)
        self.log("fitness", loss*0.6 + f1*0.4, prog_bar=True, on_step=False, on_epoch=True)
        return loss
    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), self.lr)
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler, "monitor": "fitness"}

model_wrapper = WrapperModel(lr=1e-3)
trainer = Trainer(max_epochs=20, accelerator='gpu', devices=2,
                  callbacks=[ModelCheckpoint(monitor='fitness', mode='min', save_top_k=1),
                             EarlyStopping(monitor='fitness', mode='min', patience=5)], max_time="00:01:00:00",
                  logger=TensorBoardLogger("tb_logs", name="video_classification"),gradient_clip_val=1.0, accumulate_grad_batches=4)
    
# trainer.fit(model_wrapper, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [35]:
from torchvision.models import ResNet152_Weights, resnet152
import torch
resnet = resnet152(weights=ResNet152_Weights.IMAGENET1K_V1)
dummy_input = torch.randn(1, 3, 224, 224)
hola = torch.nn.Sequential(*list(resnet.children())[:-1])(dummy_input)

In [36]:
print(hola.shape)

torch.Size([1, 2048, 1, 1])


In [37]:
# Define CRNN model
import torch.nn as nn

class CRNN(nn.Module):
    def __init__(self, num_classes=100, hidden_size=256):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(*list(resnet.children())[:-1])
        # self.feature_dim = 2048
        # self.pool = nn.AdaptiveAvgPool2d((1, 1))
        transformers_layer = nn.TransformerEncoderLayer(d_model=2048, nhead=8, batch_first=True, activation='gelu', dropout=0.2)
        self.transformer = nn.TransformerEncoder(transformers_layer, num_layers=2, norm=nn.LayerNorm(2048))
        # self.rnn = nn.LSTM(self.feature_dim, hidden_size, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(2048, num_classes)

    def forward(self, x: torch.Tensor):
        B, T, C, H, W = x.shape
        x = x.reshape(B * T, C, H, W)
        # print(x.shape)
        with torch.no_grad():
            features = self.cnn(x)
        # print(features.shape)
        # pooled = self.pool(features).squeeze(-1).squeeze(-1)
        seq = features.reshape(B, T, 2048)
        # print(seq.shape)
        res = self.transformer(seq)
        # print("transformers res", res.shape)
        final = res.mean(dim=1)
        # print("final", final.shape)
        return self.fc(final)


In [38]:
# Debug model
# del model
model = CRNN()
dummy_input = torch.randn(2, 16, 3, 224, 224)
output = model(dummy_input)
print(output.shape)

torch.Size([2, 100])


In [39]:
from pytorch_lightning import LightningModule, Trainer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torchmetrics import F1Score
F1Score = F1Score(num_classes=NUM_CLASSES, average='macro', task='multiclass')
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, StochasticWeightAveraging
from pytorch_lightning.loggers import TensorBoardLogger
from torch.optim.lr_scheduler import ReduceLROnPlateau

class WrapperModel(LightningModule):
    def __init__(self, lr=1e-2, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cnn = nn.Sequential(*list(resnet.children())[:-1])
        # self.feature_dim = 2048
        # self.pool = nn.AdaptiveAvgPool2d((1, 1))
        transformers_layer = nn.TransformerEncoderLayer(d_model=2048, nhead=8, batch_first=True, activation='gelu', dropout=0.2)
        self.transformer = nn.TransformerEncoder(transformers_layer, num_layers=2, norm=nn.LayerNorm(2048))
        # self.rnn = nn.LSTM(self.feature_dim, hidden_size, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(2048, 100)
        self.criterion = CrossEntropyLoss()
        self.f1 = F1Score
        self.lr = lr
    
    def forward(self, x):
        B, T, C, H, W = x.shape
        # print(x.shape)
        x = x.reshape(B * T, C, H, W)
        
        with torch.no_grad():
            features = self.cnn(x)
        # print(features.shape)
        # pooled = self.pool(features).squeeze(-1).squeeze(-1)
        seq = features.reshape(B, T, 2048)
        # print(seq.shape)
        res = self.transformer(seq)
        # print("transformers res", res.shape)
        final = res.mean(dim=1)
        # print("final", final.shape)
        return self.fc(final)
    
    def training_step(self, batch, batch_idx):
        frames, labels = batch['frames'], batch['label_idx']
        outputs = self(frames)
        loss = self.criterion(outputs, labels)
        f1 = self.f1(outputs, labels)
        self.log('train_f1', f1, prog_bar=True, on_step=True)
        self.log('train_loss', loss, prog_bar=True, on_step=True)
        return loss
    def validation_step(self, batch, batch_idx):
        frames, labels = batch['frames'], batch['label_idx']
        outputs = self(frames)
        loss = self.criterion(outputs, labels)
        f1 = self.f1(outputs, labels)
        self.log('val_f1', f1, prog_bar=True, on_step=False, on_epoch=True)
        self.log('val_loss', loss, prog_bar=True, on_step=True, on_epoch=False)
        self.log("fitness", loss*0.6 + f1*0.4, prog_bar=True, on_step=False, on_epoch=True)
        return loss
    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), self.lr)
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler, "monitor": "fitness"}

model_wrapper = WrapperModel(lr=1e-3)
trainer = Trainer(max_epochs=20, accelerator='gpu', devices=2,
                  callbacks=[ModelCheckpoint(monitor='fitness', mode='min', save_top_k=1),
                             EarlyStopping(monitor='fitness', mode='min', patience=5), StochasticWeightAveraging(1e-6)], max_time="00:01:00:00",
                  logger=TensorBoardLogger("tb_logs", name="crnn_transformers"),gradient_clip_val=1.0, accumulate_grad_batches=4, profiler="simple", )
    
trainer.fit(model_wrapper, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
W1204 10:53:51.152000 2659 torch/multiprocessing/spawn.py:169] Terminating process 2818 via signal SIGTERM


ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
    fn(i, *args)
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
    results = function(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/trainer.py", line 598, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/trainer.py", line 967, in _run
    self.strategy.setup_environment()
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/strategies/ddp.py", line 154, in setup_environment
    self.setup_distributed()
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/strategies/ddp.py", line 206, in setup_distributed
    _init_dist_connection(self.cluster_environment, self._process_group_backend, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/lightning_fabric/utilities/distributed.py", line 298, in _init_dist_connection
    torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/distributed/c10d_logger.py", line 95, in wrapper
    func_return = func(*args, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/distributed/distributed_c10d.py", line 1714, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
                              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/distributed/rendezvous.py", line 274, in _env_rendezvous_handler
    store = _create_c10d_store(
            ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/distributed/rendezvous.py", line 194, in _create_c10d_store
    return TCPStore(
           ^^^^^^^^^
RuntimeError: The server socket has failed to listen on any local network address. port: 40489, useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use


In [None]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
lstm = nn.LSTM(512, 512, batch_first=True, dropout=0.3)
out = lstm(src)
out[:,-1,:].shape

In [None]:
help(torch.squeeze)