In [1]:
import torchvision
import torch

from torchvision import transforms
from tqdm.notebook import tqdm
from copy import deepcopy
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np
import utils
import time
import cv2
import os

CONSUMER_PATH = './videos/consumer/'
GUIDE_PATH = './videos/guide/'

In [2]:
device = torch.device('cpu')
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=True, num_keypoints=17)
model.to(device).eval()



KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [3]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

# Load Datasets

### Consumer datasets

In [4]:
consumer_file_lists = os.listdir(CONSUMER_PATH)
consumer_file_lists = [os.path.join(CONSUMER_PATH, f) for f in consumer_file_lists][0]
print(f"Consumer file lists: {consumer_file_lists}")

Consumer file lists: ./videos/consumer/consumer_1.mp4


In [5]:
consumer_video = cv2.VideoCapture(consumer_file_lists)
consumer_frame_width = int(consumer_video.get(3))
consumer_frame_height = int(consumer_video.get(4))

consumer_video_path = CONSUMER_PATH + 'processed_videos_consumer.mp4'
consumer_video_write = cv2.VideoWriter(
    consumer_video_path,                # output file name
    cv2.VideoWriter_fourcc(*'MP4V'),    # codec
    20.0,                               # fps
    (consumer_frame_width, consumer_frame_height) # frame size
)

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


### Guide datasets

In [6]:
guide_file_list = os.listdir(GUIDE_PATH)
guide_file_list = [os.path.join(GUIDE_PATH, file) for file in guide_file_list]
guide_file_list = [file for file in guide_file_list if file.endswith('.mp4')][0]

print(f"Guide file list: {guide_file_list}")

Guide file list: ./videos/guide/guide_1.mp4


In [7]:
guide_video = cv2.VideoCapture(guide_file_list)
guide_frame_width = int(guide_video.get(3))
guide_frame_height = int(guide_video.get(4))

guide_video_path = GUIDE_PATH + 'processed_videos_guide.mp4'
guide_video_write = cv2.VideoWriter(
    guide_video_path,                   # Path to the output video file
    cv2.VideoWriter_fourcc(*'mp4v'),    # Codec to be used
    20.0,                               # Frame rate of the video
    (guide_frame_width, guide_frame_height) # Frame size
)

# Guide Video Feature Extraction

In [9]:
total_fps, frame_count = 0, 0
guide_skeleton_list = []
pbar = tqdm(desc=f"Extracting skeleton from guide video", total=guide_video.get(cv2.CAP_PROP_FRAME_COUNT))

while True:
    ret, frame = guide_video.read()
    if ret == True:
        pbar.update(1)

        # Get frames from video, and convert to PIL image for processing with OpenPose
        image_from_video = np.array(frame, dtype=np.float32)
        original_image = deepcopy(image_from_video)

        # Transform the image to tensor
        image_from_video = torch.Tensor(image_from_video).permute(2, 0, 1)
        image_from_video = image_from_video.unsqueeze(0).to(device)

        # Get the output from the model
        start_time = time.time()
        with torch.no_grad():
            output = model(image_from_video)
        latency = time.time() - start_time

        # Get the keypoint from the output
        keypoints = utils.get_keypoints(output, original_image, threshold=0.9)
        guide_skeleton_list.append(keypoints)

        fps = 1 / latency 
        total_fps += fps
        frame_count += 1
        pbar.set_postfix({"FPS": fps, "Avg FPS": total_fps / frame_count})

    else:
        break

pbar.close()

guide_skeleton_list = np.array(guide_skeleton_list)
guide_skeleton_list = guide_skeleton_list.astype(np.float32)
print(f"Guide skeleton list shape: {guide_skeleton_list.shape}")

Extracting skeleton from guide video:   0%|          | 0/546.0 [00:00<?, ?it/s]

Guide skeleton list shape: (0,)


In [10]:
total_fps, frame_count = 0, 0
consumer_skeleton_list = []
pbar = tqdm(desc=f"Extracting skeleton from consumer video", total=guide_video.get(cv2.CAP_PROP_FRAME_COUNT))

while True:
    ret, frame = consumer_video.read()
    if ret == True:
        pbar.update(1)

        # Get frames from video, and convert to PIL image for processing with OpenPose
        image_from_video = np.array(frame, dtype=np.float32)
        original_image = deepcopy(image_from_video)

        # Transform the image to tensor
        image_from_video = torch.Tensor(image_from_video).permute(2, 0, 1)
        image_from_video = image_from_video.unsqueeze(0).to(device)

        # Get the output from the model
        start_time = time.time()
        with torch.no_grad():
            output = model(image_from_video)
        latency = time.time() - start_time

        # Get the keypoint from the output
        keypoints = utils.get_keypoints(output, original_image, threshold=0.9)
        consumer_skeleton_list.append(keypoints)

        fps = 1 / latency 
        total_fps += fps
        frame_count += 1
        pbar.set_postfix({"FPS": fps, "Avg FPS": total_fps / frame_count})

    else:
        break

pbar.close()

consumer_skeleton_list = np.array(consumer_skeleton_list)
consumer_skeleton_list = consumer_skeleton_list.astype(np.float32)
print(f"Consumer skeleton list shape: {consumer_skeleton_list.shape}")

Extracting skeleton from consumer video:   0%|          | 0/546.0 [00:00<?, ?it/s]