In [1]:
import os
import sys
import torch
import pytorchvideo

import json 
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 
from typing import Dict



Setup  
Download the id to label mapping for the Kinetics 400 dataset on which the Torch Hub models were trained. This will be used to get the category label names from the predicted class ids.

In [2]:
# !wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json

In [3]:
# Read the Kinetics class-name file (label name -> class id)
with open("kinetics_classnames.json", "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert it into a class id -> label name lookup, dropping any double
# quotes embedded in the label text
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

Load Model using Torch Hub API  
PyTorchVideo provides several pretrained models through Torch Hub. Available models are described in the model zoo documentation.  

Here we are selecting the slowfast_r50 model, which was trained using an 8x8 setting on the Kinetics 400 dataset.  

NOTE: to run on GPU in Google Colab, in the menu bar select: Runtime -> Change runtime type -> Hardware Accelerator -> GPU  

In [4]:
# Device on which to run the model.
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pick a pretrained model
model_name = "slowfast_r50"
model = torch.hub.load("facebookresearch/pytorchvideo:main", model=model_name, pretrained=True)

# Move to the desired device and switch to eval mode for inference
model = model.to(device)
model = model.eval()

Using cache found in /home/tim/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [5]:
####################
# SlowFast transform
####################

# Settings consumed by the transform pipeline and clip-duration computation
# defined later in this cell.
side_size = 256  # passed to ShortSideScale(size=...)
mean = [0.45, 0.45, 0.45]  # passed to NormalizeVideo together with `std`
std = [0.225, 0.225, 0.225]
crop_size = 256  # passed to CenterCropVideo
num_frames = 32  # passed to UniformTemporalSubsample
# clip_duration = (num_frames * sampling_rate) / frames_per_second
# NOTE(review): with these values the clip is ~42.7 s long and only
# `num_frames` frames are sampled from it -- confirm this sparse sampling is intended.
sampling_rate = 16
frames_per_second = 12
alpha = 4  # read by PackPathway: the slow pathway keeps (frame count // alpha) frames

class PackPathway(torch.nn.Module):
    """
    Transform that packs a video tensor into the list ``[slow_pathway, fast_pathway]``.

    The fast pathway is the input tensor unchanged. The slow pathway keeps
    ``frames.shape[1] // alpha`` frames, picked at evenly spaced positions
    along dimension 1 of the input (presumably the temporal dimension).

    Args:
        alpha: Ratio between the number of fast-pathway and slow-pathway
            frames. When left as ``None``, the module-level ``alpha`` is
            looked up each time the transform is called.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """
        Split ``frames`` into the two pathways.

        Args:
            frames: Video tensor whose dimension 1 indexes frames.

        Returns:
            A two-element list ``[slow_pathway, fast_pathway]``.
        """
        # Fall back to the notebook-level setting when no ratio was given.
        ratio = alpha if self.alpha is None else self.alpha

        fast_pathway = frames
        # Evenly spaced frame positions for the slow pathway. The index tensor
        # is created on the same device as `frames` because index_select
        # requires both tensors to live on one device.
        slow_indices = torch.linspace(
            0,
            frames.shape[1] - 1,
            frames.shape[1] // ratio,
            device=frames.device,
        ).long()
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        return [slow_pathway, fast_pathway]

# Preprocessing steps, applied in order to the clip's "video" entry:
# subsample frames, rescale pixel values by 1/255, normalize, resize the
# short side, center-crop, then pack into the [slow, fast] pathway list.
video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),
]
transform = ApplyTransformToKey(key="video", transform=Compose(video_steps))

# Length in seconds of each clip to read: number of sampled frames times the
# sampling rate, divided by the frame rate.
clip_duration = num_frames * sampling_rate / frames_per_second
clip_duration

42.666666666666664

In [6]:
# # Load the example video
# video_path = "porevo.mp4"  

# # Select the duration of the clip to load by specifying the start and end duration
# # The start_sec should correspond to where the action occurs in the video
# start_sec = 90
# end_sec = start_sec + clip_duration 

# # Initialize an EncodedVideo helper class
# video = EncodedVideo.from_path(video_path)

# # Load the desired clip
# video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# # Apply a transform to normalize the video input
# video_data = transform(video_data)

In [7]:

# # Move the inputs to the desired device
# inputs = video_data["video"]
# inputs = [i.to(device)[None, ...] for i in inputs]

In [8]:
# # Pass the input clip through the model 
# preds = model(inputs)

In [9]:
# # Get the predicted classes 
# post_act = torch.nn.Softmax(dim=1)
# preds = post_act(preds)
# pred_classes = preds.topk(k=5).indices

# # Map the predicted classes to the label names
# pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]
# print("Predicted labels: %s" % ", ".join(pred_class_names))

In [11]:
def do_video(video_path="porevo.mp4", first_start_sec=90, num_clips=10, top_k=5):
    """
    Classify consecutive clips of a video and print the top labels for each.

    Clip ``n`` (0-based) covers ``clip_duration`` seconds starting at
    ``first_start_sec + n * clip_duration``. Each clip is preprocessed with the
    notebook-level ``transform``, run through ``model`` on ``device``, and the
    ``top_k`` most probable class ids are printed as label names using
    ``kinetics_id_to_classname``.

    Args:
        video_path: Path of the video file to read.
        first_start_sec: Start time, in seconds, of the first clip. It should
            correspond to where the action occurs in the video.
        num_clips: Number of back-to-back clips to classify.
        top_k: Number of labels printed per clip.

    Returns:
        None. One "Predicted labels: ..." line is printed per classified clip.
    """
    # Open the video once; every clip is read from the same EncodedVideo helper.
    video = EncodedVideo.from_path(video_path)
    # Converts the model output into per-class probabilities.
    post_act = torch.nn.Softmax(dim=1)

    for clip_index in range(num_clips):
        # Select the window of the clip to load
        start_sec = first_start_sec + clip_duration * clip_index
        end_sec = start_sec + clip_duration

        # Load the desired clip
        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

        # NOTE(review): presumably get_clip returns no frames ("video" is None)
        # once the window lies past the end of the file -- confirm against the
        # PyTorchVideo docs. Stop cleanly instead of failing inside the transform.
        if video_data["video"] is None:
            print("No frames found from %.2fs onwards; stopping." % start_sec)
            break

        # Apply a transform to normalize the video input
        video_data = transform(video_data)

        # Move each pathway to the desired device and add a leading batch dimension
        inputs = [pathway.to(device)[None, ...] for pathway in video_data["video"]]

        # Inference only: skip autograd bookkeeping to save memory and time
        with torch.no_grad():
            preds = post_act(model(inputs))
        pred_classes = preds.topk(k=top_k).indices

        # Map the predicted classes to the label names
        pred_class_names = [
            kinetics_id_to_classname[int(class_id)] for class_id in pred_classes[0]
        ]
        print("Predicted labels: %s" % ", ".join(pred_class_names))

In [12]:
# Run the classification loop with its default settings; the predicted
# labels for each clip are printed below.
do_video()

Predicted labels: taking a shower, shaving legs, trimming or shaving beard, washing hair, shaving head
Predicted labels: shaving legs, washing hair, situp, getting a haircut, shaving head
Predicted labels: tickling, shaving legs, situp, washing hair, laughing
Predicted labels: tickling, shaving legs, washing hair, getting a tattoo, kissing
Predicted labels: washing hair, tickling, washing feet, shaving legs, kissing
Predicted labels: shaving legs, washing hair, washing feet, kissing, tickling
Predicted labels: trimming or shaving beard, applying cream, waxing chest, eating burger, waxing back
Predicted labels: yoga, shaving legs, eating hotdog, waxing legs, massaging feet
Predicted labels: yoga, trimming or shaving beard, drinking beer, shaving legs, eating hotdog
Predicted labels: trimming or shaving beard, shaving head, drinking beer, brush painting, getting a haircut
