Python Implementation: Action Recognition with Pretrained R(2+1)D-18 Model

In [1]:
# Install dependencies:
# pip install torch torchvision opencv-python

import torch
import torchvision
import torchvision.transforms as T
import cv2
import numpy as np
import os

In [2]:
# Load pretrained action recognition model
model = torchvision.models.video.r3d_18(pretrained=True)
model.eval()


Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /root/.cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth
100%|██████████| 127M/127M [00:01<00:00, 125MB/s]


VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [3]:
# Class labels for Kinetics-400 (partial demo, real list has 400 classes)
kinetics_classes = ["abseiling", "air drumming", "answering questions", "applauding", "applying cream",
                    "archery", "arm wrestling", "arranging flowers", "assembling computer", "auctioning"]

In [4]:
# Video loader and preprocessor
def load_video_frames(path, num_frames=16, size=(112, 112)):
    cap = cv2.VideoCapture(path)
    frames = []
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(total // num_frames, 1)

    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, size)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
    cap.release()

    # Transform and normalize
    transform = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.43216, 0.394666, 0.37645],
                    std=[0.22803, 0.22145, 0.216989])
    ])
    frames = [transform(frame) for frame in frames]
    video = torch.stack(frames).permute(1, 0, 2, 3)  # [C, T, H, W]
    return video.unsqueeze(0)  # [1, C, T, H, W]


In [5]:
# Path to your video file
video_path = "sample_action.mp4"  # Replace with real video path
video_tensor = load_video_frames(video_path)

In [6]:
# Predict action
with torch.no_grad():
    outputs = model(video_tensor)
    probs = torch.nn.functional.softmax(outputs[0], dim=0)
    top5 = torch.topk(probs, k=5)

In [8]:
print("🎬 Top 5 Predicted Actions:")
for idx in top5.indices:
    if idx < len(kinetics_classes) and idx < len(probs):
        print(f"{kinetics_classes[idx]} ({probs[idx]*100:.2f}%)")
    else:
        print(f"Index {idx} is out of range.")


🎬 Top 5 Predicted Actions:
Index 49 is out of range.
Index 45 is out of range.
Index 207 is out of range.
Index 153 is out of range.
Index 107 is out of range.
