In [1]:
import torch
from torchvision import transforms
from torchvision.io import read_video
from PIL import Image
import os
import cv2
import numpy as np
from torchvision.models.video import r3d_18
import torchvision

In [2]:
# Clip length in frames; default for `num_frames` in prepare_video_tensor
# (shorter inputs are padded, longer ones truncated to this length).
NUM_FRAMES = 16
# Side length in pixels; default for `input_size` in prepare_video_tensor,
# where every frame is resized to (INPUT_SIZE, INPUT_SIZE).
INPUT_SIZE = 112

In [3]:
def prepare_video_tensor(keyframes, num_frames=NUM_FRAMES, input_size=INPUT_SIZE,
                         keyframes_dir=None):
    """Load keyframe images and pack them into a single video-clip tensor.

    Args:
        keyframes: Iterable of image file names, in temporal order.
        num_frames: Clip length T. Inputs longer than this are truncated to
            their first ``num_frames`` entries; shorter inputs are padded by
            repeating the last frame.
        input_size: Each frame is resized to ``(input_size, input_size)``
            (the aspect ratio is not preserved).
        keyframes_dir: Directory containing the files. When omitted, the
            module-level ``KEYFRAMES_DIR`` is used, as before.

    Returns:
        A float tensor of shape ``[1, 3, num_frames, input_size, input_size]``
        (batch, channels, time, height, width).

    Raises:
        ValueError: If ``num_frames`` is less than 1 or ``keyframes`` is empty.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    # Only the first `num_frames` entries are ever used, so do not decode the rest.
    selected = list(keyframes)[:num_frames]
    if not selected:
        raise ValueError("keyframes is empty; at least one frame is required")

    if keyframes_dir is None:
        # Fall back to the notebook-level setting (looked up at call time).
        keyframes_dir = KEYFRAMES_DIR

    # Build the preprocessing pipeline once instead of once per frame.
    # The mean/std are the normalization constants listed for torchvision's
    # Kinetics-pretrained video models.
    transform = transforms.Compose([
        transforms.Resize((input_size, input_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.43216, 0.394666, 0.37645],
                             std=[0.22803, 0.22145, 0.216989]),
    ])

    frames = []
    for frame in selected:
        image_path = os.path.join(keyframes_dir, frame)
        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory image, so the transform is safe.
        with Image.open(image_path) as image:
            frames.append(transform(image.convert('RGB')))

    # If there are fewer frames than requested, pad by repeating the last frame.
    frames.extend([frames[-1]] * (num_frames - len(frames)))

    # Assemble the clip in the [C, T, H, W] layout, then add a batch dimension.
    video = torch.stack(frames)        # [T, C, H, W]
    video = video.permute(1, 0, 2, 3)  # [C, T, H, W]
    video = video.unsqueeze(0)         # [1, C, T, H, W]
    return video

In [4]:
# Load the pretrained R3D-18 video network and switch it to inference mode
# (`eval()` returns the module itself, so the call can be chained).
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favor of
# `weights=`; confirm the installed version before migrating.
model = r3d_18(pretrained=True).eval()
model

Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /Users/VoThinhPhat/.cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth
100%|██████████| 127M/127M [02:00<00:00, 1.11MB/s] 


VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [11]:
# Device used for inference; detect_action moves its input tensor to this same device.
device = 'cpu'
# Move the model to `device`. As the last expression of the cell, this also
# displays the model's repr.
model.to(device)

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [6]:
def detect_action(video_tensor, labels_path='kinetics_labels.txt'):
    """Classify a video clip and return the predicted action label.

    Relies on the notebook-level ``model`` and ``device`` globals.

    Args:
        video_tensor: Input clip for ``model``, e.g. the output of
            ``prepare_video_tensor``. Must contain a single clip (batch size 1),
            because the prediction is reduced to a Python int with ``.item()``.
        labels_path: Text file with one class name per line, where line ``i``
            is the name of class index ``i``. It must match the label set the
            pretrained model was trained on (the Kinetics class list).

    Returns:
        The label string for the highest-scoring class.

    Raises:
        IndexError: If the labels file has fewer entries than the predicted
            class index requires.
    """
    video_tensor = video_tensor.to(device)
    with torch.no_grad():
        logits = model(video_tensor)  # [1, num_classes]
    predicted_class_idx = logits.argmax(-1).item()

    # Explicit encoding so the result does not depend on the platform default.
    with open(labels_path, 'r', encoding='utf-8') as f:
        labels = [line.strip() for line in f]

    if predicted_class_idx >= len(labels):
        raise IndexError(
            f"predicted class index {predicted_class_idx} is out of range for "
            f"{len(labels)} labels loaded from {labels_path!r}"
        )
    return labels[predicted_class_idx]

In [9]:
# Directory holding the keyframe images of one video. Set the KEYFRAMES_DIR
# environment variable to run on another machine without editing the notebook;
# the original local path remains the default.
KEYFRAMES_DIR = os.environ.get(
    'KEYFRAMES_DIR',
    '/Users/VoThinhPhat/Desktop/data/batch1/keyframes/keyframes_L01/L01_V001/',
)

# Keyframe file names in chronological order
keyframes = ['07791.jpg', '07805.jpg', '07818.jpg', '07846.jpg']

video_tensor = prepare_video_tensor(keyframes)

# Sanity check: one RGB clip of NUM_FRAMES frames at INPUT_SIZE x INPUT_SIZE.
assert tuple(video_tensor.shape) == (1, 3, NUM_FRAMES, INPUT_SIZE, INPUT_SIZE), (
    f"unexpected clip shape: {tuple(video_tensor.shape)}"
)

In [13]:
# Classify the prepared clip and print the predicted label
# (the message is Vietnamese for "Detected action: <label>").
action = detect_action(video_tensor)
print(f"Hành động được phát hiện: {action}")

Hành động được phát hiện: skiing (not slalom or crosscountry)
