In [46]:
import torch
import requests
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
import csv
from io import StringIO
import torch
import torch.nn.functional as F
import torchvision.transforms as T
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms._transforms_video import NormalizeVideo
from transforms import SpatialCrop, TemporalCrop, DepthNorm
import cv2
import glob
from PIL import Image

In [66]:
# Device on which the model and its input tensors are placed.
device = "cpu"

In [61]:
def files_to_video(files, video_name='./video.mp4', fps=24):
    """Encode a sequence of image files into a video file.

    The frame size is read from the first image, and the frames are written
    in the order given using the 'mp4v' codec.

    Args:
        files: Paths of the image files to use as frames, in playback order.
        video_name: Path of the video file to write.
        fps: Frame rate handed to the video writer.

    Raises:
        ValueError: If `files` is empty, or if one of the files cannot be
            decoded by `cv2.imread`.
        RuntimeError: If the video writer could not be opened.
    """
    # Materialise once so any iterable (not just a list) can be indexed and looped over.
    files = list(files)
    if not files:
        raise ValueError("files_to_video() needs at least one image file")

    # Only the dimensions are needed; the context manager closes the file handle.
    with Image.open(files[0]) as first_frame:
        frame_size = (first_frame.width, first_frame.height)

    video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    try:
        if not video.isOpened():
            raise RuntimeError("could not open video writer for %r" % (video_name,))
        for file in files:
            frame = cv2.imread(file)
            # cv2.imread signals an unreadable file by returning None instead of raising.
            if frame is None:
                raise ValueError("could not read image file %r" % (file,))
            video.write(frame)
    finally:
        # Release the writer even when a frame fails, so the file handle is not leaked.
        video.release()

In [62]:
# Gather the JPEG frames of one recording, order them by filename and
# encode them with files_to_video.
frame_paths = glob.glob('./satis-cv-ai-exercise-data/train/P01/P01_05/*.jpg')
frame_paths.sort()
files_to_video(frame_paths)

In [21]:
# Load the model registered under `model_name` from the
# facebookresearch/omnivore repository via torch.hub.
model_name = "omnivore_swinB_epic"
model = torch.hub.load("facebookresearch/omnivore:main", model=model_name)

# Place the model on the chosen device and switch it to evaluation mode.
model = model.to(device).eval()

Using cache found in /Users/vshmyhlo/.cache/torch/hub/facebookresearch_omnivore_main


In [22]:
# Download the class-label CSV used to turn predicted indices into names.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the label file.
response = requests.get(
    'https://dl.fbaipublicfiles.com/omnivore/epic_action_classes.csv',
    timeout=30,
)
response.raise_for_status()
reader = csv.reader(StringIO(response.text))
# The position of a row in the CSV is used as the class index; the cells of
# each row are joined with single spaces to form the label.
epic_id_to_action = {idx: " ".join(row) for idx, row in enumerate(reader)}

In [28]:
# Temporal sampling parameters.
num_frames = 32
sampling_rate = 2
frames_per_second = 30

# num_frames frames taken every sampling_rate-th frame, divided by the frame rate.
clip_duration = num_frames * sampling_rate / frames_per_second

# Steps applied, in this order, to the value stored under the "video" key.
_video_steps = [
    UniformTemporalSubsample(num_frames),
    T.Lambda(lambda frames: frames / 255.0),  # divide every value by 255
    ShortSideScale(size=224),
    NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    TemporalCrop(frames_per_clip=32, stride=40),
    SpatialCrop(crop_size=224, num_crops=3),
]

video_transform = ApplyTransformToKey(key="video", transform=T.Compose(_video_steps))

In [63]:
# Location of the video file that is loaded for inference.
video_path = './video.mp4'

In [64]:
# Open the video file through the EncodedVideo helper.
video = EncodedVideo.from_path(video_path)

# Read the clip between 0.0 s and 2.0 s and run it through the
# preprocessing pipeline in one step.
video_data = video_transform(video.get_clip(start_sec=0.0, end_sec=2.0))

# Pull the transformed result out of the returned mapping.
video_inputs = video_data["video"]

# Keep only the first entry and prepend a batch axis.
# The model expects inputs of shape: B x C x T x H x W
video_input = video_inputs[0][None]

No accelerated colorspace conversion found from yuv420p to rgb24.


In [67]:
# Run the forward pass with gradient tracking switched off.
with torch.no_grad():
    prediction = model(video_input.to(device), input_type="video")

    # Indices of the five highest-scoring classes.
    pred_classes = torch.topk(prediction, k=5).indices

# Look up the label of each index from the first row of the result.
pred_class_names = [epic_id_to_action[index] for index in pred_classes[0].tolist()]
print("Top 5 predicted actions: %s" % ", ".join(pred_class_names))

Top 5 predicted actions: take bread, take bag, put bread, insert bread, put bag
