In [1]:
## Prepare for the pre-trained model

import torch
import json
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)
from typing import Dict

# Device on which to run the model
# Set to cuda to load on GPU
device = "cuda"

# Pick a pretrained model and load the pretrained weights
model_name = "slowfast_r50"
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)

# Set to eval mode and move to desired device
model = model.to(device)
model = model.eval()

with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")


####################
# SlowFast transform
####################

side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
alpha = 4

class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate)/frames_per_second

Using cache found in C:\Users\36394/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [2]:
# Define a function to run the pre-model on a video

def run_pretrained_model(video_path):
    # Load the example video

    # Select the duration of the clip to load by specifying the start and end duration
    # The start_sec should correspond to where the action occurs in the video
    start_sec = 0
    end_sec = start_sec + clip_duration

    # Initialize an EncodedVideo helper class
    video = EncodedVideo.from_path(video_path)

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = [i.to(device)[None, ...] for i in inputs]
    preds = model(inputs)

    # Get the predicted classes
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    pred_classes = preds.topk(k=5).indices

    # Map the predicted classes to the label names
    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]
    print("Predicted labels: %s" % ", ".join(pred_class_names))

# Test the function
video_path = "../path_save_video.mp4"
run_pretrained_model(video_path)

Predicted labels: breakdancing, yoga, tai chi, dancing ballet, robot dancing


In [6]:
# Delete the bad data
import os
# Define the dataset folder
dir_10s = r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\Data_processed'
dir_vision = r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\Vision_dataset'
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')

g = os.walk(dir_vision)
num_bad_video = 0
num_video = 0
for path,dir_list,file_list in g:
    for file_name in file_list:
        path_video = os.path.join(path, file_name)
        try:
            video = EncodedVideo.from_path(path_video)
        except:
            print('Error: {}'.format(os.path.basename(path_video)))
            num_bad_video += 1
            os.remove(path_video)
        num_video += 1

In [10]:
import os
# Define the dataset folder
dir_10s = r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\Data_processed'
dir_vision = r'C:\Users\36394\Study\GWU\PHD in Biomedical Engineer\Research\FOS\Autism_dataset\Vision_dataset'
dir_train = os.path.join(dir_vision, 'train')
dir_val = os.path.join(dir_vision, 'val')

g = os.walk(dir_vision)
num_bad_video = 0
num_video = 0
for path,dir_list,file_list in g:
    for file_name in file_list:
        path_video = os.path.join(path, file_name)
        try:
            run_pretrained_model(path_video)
        except:
            print('Error: {}'.format(os.path.basename(path_video)))
            num_bad_video += 1
        num_video += 1
        

            

Predicted labels: reading book, opening present, cutting nails, hugging, reading newspaper
Predicted labels: washing feet, reading book, reading newspaper, cleaning shoes, playing keyboard
Error: C+_10.mp4
Predicted labels: dunking basketball, passing American football (not in game), playing basketball, throwing ball, drop kicking
Predicted labels: blowing out candles, playing keyboard, playing xylophone, opening present, reading book
Predicted labels: clean and jerk, deadlifting, snatch weight lifting, bench pressing, squat
Predicted labels: somersaulting, drop kicking, breakdancing, faceplanting, tickling
Predicted labels: bowling, opening present, waiting in line, auctioning, playing monopoly
Predicted labels: blowing out candles, making a sandwich, setting table, making sushi, folding clothes
Predicted labels: wrapping present, unboxing, folding paper, reading book, opening present
Predicted labels: blowing out candles, setting table, pushing car, headbutting, shaking head
Predicte

In [9]:
print('Number of bad video: {}'.format(num_bad_video))
print('Number of video: {}'.format(num_video))
print('Percentage of bad video: {}'.format(num_bad_video/num_video))

Number of bad video: 95
Number of video: 3227
Percentage of bad video: 0.02943910753021382
