In [1]:
import torch
import cv2
import numpy as np
import torch.nn as nn
from torchvision.transforms import ToTensor
import time

In [3]:
# Target size (in pixels) that every video frame is resized to before use.
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64

# How many frames of one video are fed to the model as a single sequence.
SEQUENCE_LENGTH = 25

# Directory that holds the UCF50 dataset.
DATASET_DIR = "UCF50"

# Names of the classes used for training; any set of classes may be chosen.
# A predicted class index is translated back to a name by indexing this list.
CLASSES_LIST = [
    "WalkingWithDog",
    "TaiChi",
    "Swing",
    "HorseRace",
]

def frames_extraction(video_path):
    '''
    Extract up to SEQUENCE_LENGTH evenly spaced frames from a video, resized and normalized.

    The frames are sampled at a fixed stride of (total frame count // SEQUENCE_LENGTH),
    with a minimum stride of 1, starting from the first frame.
    Args:
        video_path: The path of the video in the disk, whose frames are to be extracted.
    Returns:
        frames_list: A list containing the resized and normalized frames of the video.
            The list is shorter than SEQUENCE_LENGTH (possibly empty) when a frame
            cannot be read, because extraction stops at the first failed read.
    '''

    # Declare a list to store video frames.
    frames_list = []

    # Read the Video File using the VideoCapture object.
    video_reader = cv2.VideoCapture(video_path)

    try:
        # Get the total number of frames in the video.
        video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))

        # Calculate the interval after which frames will be added to the list.
        skip_frames_window = max(video_frames_count // SEQUENCE_LENGTH, 1)

        # Iterate through the Video Frames.
        for frame_counter in range(SEQUENCE_LENGTH):

            # Set the current frame position of the video.
            video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)

            # Reading the frame from the video.
            success, frame = video_reader.read()

            # Stop at the first frame that cannot be read.
            if not success:
                break

            # Resize the frame to the fixed size. cv2.resize takes dsize as (width, height),
            # not (height, width).
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))

            # Normalize the resized frame by dividing it with 255 so that each pixel value then lies between 0 and 1
            normalized_frame = resized_frame / 255

            # Append the normalized frame into the frames list
            frames_list.append(normalized_frame)
    finally:
        # Release the VideoCapture object even if reading or resizing raised.
        video_reader.release()

    # Return the frames list.
    return frames_list

# Run the frame extraction on one sample clip and keep the resulting list of frames.
sequence_frames = frames_extraction(video_path='v_HorseRace_g01_c01.avi')

In [7]:
# Define the 3D CNN model in PyTorch
class Conv3DModel(nn.Module):
    '''
    3D CNN classifier: three Conv3d -> MaxPool3d -> Dropout3d stages followed by two
    fully connected layers.

    The first convolution uses `num_frames` as its input-channel count, so the expected
    input layout is (batch, num_frames, input_depth, image_height, image_width).

    Args:
        num_classes: Number of scores produced by the final linear layer.
        num_frames: Number of frames per sequence; becomes the input channels of conv1.
        input_depth: Size of the depth axis of the input. Defaults to 3 -- presumably the
            colour channels of each frame as stacked by preprocess_video; confirm against
            the training pipeline.
        image_height: Height of the input frames.
        image_width: Width of the input frames.
    '''

    def __init__(self, num_classes=len(CLASSES_LIST), num_frames=SEQUENCE_LENGTH,
                 input_depth=3, image_height=IMAGE_HEIGHT, image_width=IMAGE_WIDTH):
        super().__init__()
        # Attribute names are kept unchanged so the parameter names of the module
        # (and therefore its state_dict keys) stay the same.
        self.conv1 = nn.Conv3d(num_frames, 32, kernel_size=(3, 3, 3), padding=1)
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout1 = nn.Dropout3d(0.4)

        self.conv2 = nn.Conv3d(32, 64, kernel_size=(3, 3, 3), padding=1)
        self.pool2 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout2 = nn.Dropout3d(0.4)

        self.conv3 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=1)
        self.pool3 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.dropout3 = nn.Dropout3d(0.4)

        self.flatten = nn.Flatten()
        # The convolutions (kernel 3, padding 1) keep every axis size, and the pools use
        # kernel/stride 1 on the depth axis, so depth stays `input_depth` while height and
        # width are halved three times (// 8). The previous formula used
        # SEQUENCE_LENGTH // 8 for the depth term, which only matched because 25 // 8 == 3.
        flattened_features = 128 * input_depth * (image_height // 8) * (image_width // 8)
        self.fc1 = nn.Linear(flattened_features, 128)
        self.dropout4 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        '''Run the three conv stages and the classifier head; returns (batch, num_classes) scores.'''
        stages = (
            (self.conv1, self.pool1, self.dropout1),
            (self.conv2, self.pool2, self.dropout2),
            (self.conv3, self.pool3, self.dropout3),
        )
        for conv, pool, dropout in stages:
            x = dropout(pool(conv(x)))

        x = self.flatten(x)
        x = self.fc1(x)
        x = self.dropout4(x)
        x = self.fc2(x)

        return x

# Build the model. It is not moved to another device, so inference runs on the CPU.
model = Conv3DModel()

# Load the pre-trained model
model_path = 'conv3D_model_best.pth'
# map_location="cpu" places the loaded tensors on the CPU regardless of the device they
# were saved from, so a checkpoint written on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
# Switch to evaluation mode so the dropout layers are disabled during inference.
model.eval()

# Define the transform for input frames
transform = ToTensor()

def preprocess_video(video_path):
    '''
    Read the first SEQUENCE_LENGTH frames of a video and pack them into one model input.

    Each frame is resized to IMAGE_WIDTH x IMAGE_HEIGHT, divided by 255, cast to float32
    and passed through `transform`; the per-frame tensors are stacked and a leading batch
    axis of size 1 is added.
    Args:
        video_path: The path of the video file to read.
    Returns:
        A tensor holding the SEQUENCE_LENGTH stacked frame tensors behind a batch axis of 1.
    Raises:
        ValueError: If fewer than SEQUENCE_LENGTH frames could be read (missing or
            unreadable file, or a video that is too short).
    '''
    # NOTE(review): this reads consecutive frames from the start of the video, whereas
    # frames_extraction samples frames at a stride across the whole video -- confirm
    # which sampling matches the data the model was trained on.
    frames_list = []
    video_reader = cv2.VideoCapture(video_path)

    try:
        for _ in range(SEQUENCE_LENGTH):
            success, frame = video_reader.read()
            if not success:
                break

            # cv2.resize takes dsize as (width, height), not (height, width).
            resized_frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
            normalized_frame = resized_frame / 255
            frames_list.append(transform(normalized_frame.astype(np.float32)))
    finally:
        # Release the capture even if reading or resizing raised.
        video_reader.release()

    # The model's first layer expects exactly SEQUENCE_LENGTH frames; fail here with a
    # clear message instead of an obscure stacking or channel-mismatch error later.
    if len(frames_list) < SEQUENCE_LENGTH:
        raise ValueError(
            f"Could only read {len(frames_list)} of {SEQUENCE_LENGTH} required frames "
            f"from video: {video_path}"
        )

    frames_tensor = torch.stack(frames_list)
    return frames_tensor.unsqueeze(0)

# Path of the clip to classify; point this at your own input video.
input_video_path = 'v_Swing_g01_c01.avi'
input_data = preprocess_video(input_video_path)

# Forward pass with gradient tracking switched off.
with torch.no_grad():
    output = model(input_data)

print(output.shape)

# Position of the highest score along the class axis.
predicted_class = torch.max(output, dim=1).indices
predicted_class_index = predicted_class.item()

# Translate that position into the corresponding class name.
predicted_class_name = CLASSES_LIST[predicted_class_index]

print(f"The model predicts the input video belongs to class: {predicted_class_name}")

torch.Size([1, 4])
The model predicts the input video belongs to class: Swing


In [8]:
# Perform inference and measure time.
# time.perf_counter() is used instead of time.time(): it is a monotonic, high-resolution
# clock meant for measuring intervals and is unaffected by system clock adjustments.
with torch.no_grad():
    start_time = time.perf_counter()
    output = model(input_data)
    end_time = time.perf_counter()

# Calculate inference time
inference_time = end_time - start_time

print(f"Inference time: {inference_time} seconds")

Inference time: 0.34319281578063965 seconds


In [6]:
# Get the model parameters
model_params = list(model.parameters())

# Total number of scalar parameters in the model
total_size = sum(p.numel() for p in model_params)

# Memory taken by the parameters. Each tensor's own element width is used instead of
# assuming 4 bytes, so parameters that are not float32 are sized correctly.
total_size_bytes = sum(p.numel() * p.element_size() for p in model_params)
total_size_mb = total_size_bytes / (1024 ** 2)

print(f"Total model size: {total_size_mb:.2f} MB")

Total model size: 13.14 MB
