In [1]:
from IPython.display import clear_output

import json
import os
import glob
from tqdm import tqdm
import time

import numpy as np
import pandas as pd

from utils.images_from_video import video_to_images

from utils.keypoints import read_keypoints, read_rescale_keypoints_list, rescale_keypoints_sequence, rescale_keypoints
from model.transforms import KeypointsSequencePadding

import torch
import torch.nn.functional as F
from model.models import StopPoseNet, SequenceRecognitionNet, SequenceRecognitionNetLSTM

import cv2

In [2]:
# Widen pandas' column display so long file paths are shown up to 200 characters.
pd.options.display.max_colwidth = 200

In [3]:
# --- Input video -------------------------------------------------------------
# Source clip plus the options forwarded to video_to_images (rotation, output
# frame rate and the start/end frame passed to the extractor).
INPUT_VIDEO_PATH = "./data/video_raw/internal_resource_2333.MOV"
VIDEO_ROTATE = True
VIDEO_OUTPUT_FPS = 10
VIDEO_START_FRAME = 40
VIDEO_END_FRAME = 520

# --- Working directories -----------------------------------------------------
# Everything is created under OUTPUT_FOLDER/TEMP_BASE_FOLDER_NAME/<video name>/.
OUTPUT_FOLDER = "./data/video_translation/"
TEMP_BASE_FOLDER_NAME = "tmp"
TEMP_FRAMES_FOLDER_NAME = "frames"
TEMP_KEYPOINTS_FOLDER_NAME = "keypoints"
TEMP_VIDEO_RAW_FOLDER_NAME = "sequences"
TEMP_VIDEO_RENDERED_FOLDER_NAME = "video_rendered"
OUTPUT_BASE_FOLDER_NAME = "output"

# --- Model checkpoints -------------------------------------------------------
STOP_POSE_NET_PATH = "./model/stop_pose_detector.pt"
SEQUENCE_NET_PATH = "./model/sequence_classifier.pt"

# --- Model hyper-parameters --------------------------------------------------
# Number of sign classes the sequence classifier is built with.
NUM_CLASSES = 11

# --- Transform parameters ----------------------------------------------------
# Length passed to KeypointsSequencePadding before classification.
SEQUENCE_LENGTH_MAX = 50

# --- PyTorch -----------------------------------------------------------------
DEVICE = torch.device("cpu")

In [4]:
# Prepare directories
#
# Every per-video working directory lives under
# OUTPUT_FOLDER/TEMP_BASE_FOLDER_NAME/<video_file_name>/. Missing directories are
# created; existing ones are emptied so a re-run never mixes old and new files.

# Strip only the final extension so names such as 'clip.v2.MOV' keep their
# full stem ('clip.v2') instead of being cut at the first dot.
video_file_name = os.path.splitext(os.path.basename(INPUT_VIDEO_PATH))[0]

for folder_name in [TEMP_FRAMES_FOLDER_NAME, TEMP_KEYPOINTS_FOLDER_NAME, TEMP_VIDEO_RAW_FOLDER_NAME, TEMP_VIDEO_RENDERED_FOLDER_NAME]:
    dir_path = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, folder_name)

    if not os.path.exists(dir_path):
        print(f'Creating directory: {dir_path}')
        os.makedirs(dir_path, exist_ok=True)
    else:
        print(f'Removing content of directory:{dir_path}')
        for entry_path in glob.glob(f'{dir_path}/*'):
            # os.remove() raises on real directories; report and skip them
            # instead of aborting the whole preparation step half-way through.
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                print(f'Skipping sub-directory: {entry_path}')
                continue
            os.remove(entry_path)

Removing content of directory:./data/video_translation/tmp/internal_resource_2343/frames
Removing content of directory:./data/video_translation/tmp/internal_resource_2343/keypoints
Removing content of directory:./data/video_translation/tmp/internal_resource_2343/sequences
Removing content of directory:./data/video_translation/tmp/internal_resource_2343/video_rendered


In [5]:
# Split video into images
# The extractor's own output is wiped with clear_output() once it returns, so
# only the elapsed-time summary remains in the cell output.
start_time = time.time()

frames_output_dir = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_FRAMES_FOLDER_NAME)
extraction_options = dict(
    video_path=INPUT_VIDEO_PATH,
    output_dir=frames_output_dir,
    file_name_prefix=video_file_name,
    output_fps=VIDEO_OUTPUT_FPS,
    rotate=VIDEO_ROTATE,
    create_subdir=False,
    start_frame=VIDEO_START_FRAME,
    end_frame=VIDEO_END_FRAME,
)
video_to_images(**extraction_options)

clear_output()
print(f'Done. It took {time.time() - start_time} seconds.')

Done. It took 112.56163263320923 seconds.


In [6]:
# Transform images into keypoints
# This step is manual: the cell prints the docker command (which bind-mounts the
# per-video working directory and the local utils folder) and the OpenPose
# wrapper command, then blocks until the user confirms with Enter.
data_mount_source = os.path.realpath(os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name))
utils_mount_source = os.path.realpath(os.path.join('.', 'utils'))

docker_command = f"""docker run -it --rm --mount type=bind,source={data_mount_source},target=/sign-language/data --mount type=bind,source={utils_mount_source},target=/sign-language/utils --net=host -e DISPLAY --runtime=nvidia openpose-custom"""
print(docker_command)

openpose_command = """\npython3 ./utils/openpose_wrapper.py --image_dir ./data/frames --write_json ./data/keypoints --write_images ./data/rendered_images"""
print(openpose_command)

input('\nPress enter when you finished with docker execution')

docker run -it --rm --mount type=bind,source=/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343,target=/sign-language/data --mount type=bind,source=/home/tom/Desktop/projects/sign-language/utils,target=/sign-language/utils --net=host -e DISPLAY --runtime=nvidia openpose-custom

python3 ./utils/openpose_wrapper.py --image_dir ./data/frames --write_json ./data/keypoints --write_images ./data/rendered_images



Press enter when you finished with docker execution 


''

In [7]:
# Collect the keypoint JSON files of this video, sorted by path so that list
# positions can be used as frame indices in the following cells.
keypoints_dir_path = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_KEYPOINTS_FOLDER_NAME)
keypoints_paths = sorted(glob.glob(f'{keypoints_dir_path}/*.json'))

frame_count = len(keypoints_paths)
print(f'Number of frames: {frame_count}')

Number of frames: 521


In [8]:
# Load and rescale keypoints
# NOTE(review): read_rescale_keypoints_list comes from utils.keypoints; its
# rescaling rules are not visible in this notebook. The result is converted with
# torch.FloatTensor further down, so it is presumably a regular (non-ragged)
# numeric array with one entry per keypoints file -- confirm against the helper.
keypoints_all = read_rescale_keypoints_list(keypoints_paths)

100%|██████████| 521/521 [00:00<00:00, 10779.72it/s]


In [20]:
### Get keypoints sequences
# Run the stop-pose detector on every frame. `test` keeps the raw per-class
# sigmoid scores for inspection; `stop_pose_preds` holds the arg-max class index
# per frame (frames predicted as class 0 are later grouped into sequences).
start_time = time.time()

# Load to tensor
keypoints_all_tensor = torch.FloatTensor(keypoints_all).to(DEVICE)

# Load stop_pose model
# map_location keeps the checkpoint loadable on DEVICE even if it was saved
# from a different device (e.g. a GPU checkpoint on a CPU-only machine).
state_dict = torch.load(STOP_POSE_NET_PATH, map_location=DEVICE)
stop_pose_net = StopPoseNet().to(DEVICE)
stop_pose_net.load_state_dict(state_dict)
stop_pose_net.eval()

# Make predictions
# Pure inference: no_grad() avoids building an autograd graph for all frames.
with torch.no_grad():
    stop_pose_scores = stop_pose_net(keypoints_all_tensor).sigmoid().cpu().numpy()

# float64 copy of the raw scores, kept under the name the inspection cell uses.
test = stop_pose_scores.astype(np.float64)
stop_pose_preds = np.argmax(test, axis=1)
print(f'Predictions shape: {stop_pose_preds.shape}')

# Get sequences
def extract_sequences(stop_pose_preds):
    """Find the contiguous runs of frames whose prediction equals 0.

    Args:
        stop_pose_preds: 1-D sequence of per-frame class indices.

    Returns:
        Integer array of shape (num_runs, 2); each row is ``[start, end)`` with
        an exclusive end index. The shape is (0, 2) when no frame equals 0.
    """
    # 1 where the frame belongs to a sequence (prediction == 0), else 0.
    in_sequence = np.equal(stop_pose_preds, 0).astype(np.int8)

    # Zero-pad both ends so runs touching the first/last frame still produce
    # a rising and a falling edge.
    padded = np.concatenate(([0], in_sequence, [0]))

    # Every non-zero step is an edge; edges alternate start, end, start, end...
    edges = np.flatnonzero(np.diff(padded) != 0)
    return edges.reshape(-1, 2)

# Group the per-frame predictions into [start, end) frame ranges.
sequences_idx = extract_sequences(stop_pose_preds)
sequence_count = sequences_idx.shape[0]
print(f'Number of sequences: {sequence_count}')

# Report the wall-clock time since start_time was taken at the top of this cell.
elapsed_seconds = time.time() - start_time
print(f'\nDone. It took {elapsed_seconds} seconds.')

Predictions shape: (521,)
Number of sequences: 1

Done. It took 0.02443981170654297 seconds.


In [21]:
# Debug view: raw per-frame sigmoid scores of the stop-pose detector, copied
# before the arg-max was taken (one row per frame, one column per class).
test

array([[9.99566615e-01, 4.12081921e-04],
       [9.96641755e-01, 3.32559855e-03],
       [8.91152680e-01, 1.00755297e-01],
       ...,
       [7.66190529e-01, 2.07118630e-01],
       [7.59080708e-01, 2.14096904e-01],
       [7.69138038e-01, 2.05261633e-01]])

In [10]:
### Generate video of raw sequences
# Writes one .avi per detected sequence, built from the extracted frame images.

# Set output video params (taken from the config cell instead of repeating literals)
video_output_dir = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_VIDEO_RAW_FOLDER_NAME)
fps = VIDEO_OUTPUT_FPS

frames_dir_path = os.path.realpath(
    os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_FRAMES_FOLDER_NAME)
)
KEYPOINTS_FILE_SUFFIX = '_keypoints.json'


def _frame_path_from_keypoints_path(keypoints_path):
    """Map '<keypoints dir>/<name>_keypoints.json' to '<frames dir>/<name>.jpg'.

    Only the file name is rewritten and the directory is rebuilt from the
    configured frames folder. A blanket str.replace('keypoints', 'frames') would
    also corrupt any other occurrence of 'keypoints' in the absolute path.
    """
    file_name = os.path.basename(keypoints_path)
    if file_name.endswith(KEYPOINTS_FILE_SUFFIX):
        file_name = file_name[:-len(KEYPOINTS_FILE_SUFFIX)] + '.jpg'
    return os.path.join(frames_dir_path, file_name)


# Create df with frames (row i corresponds to keypoints_paths[i])
df_frames = pd.DataFrame(data=keypoints_paths, columns=['file_path'])
df_frames.file_path = df_frames.file_path.apply(_frame_path_from_keypoints_path)
df_frames = df_frames.sort_values('file_path')

# Generate video for each sequence
for sequence_range in tqdm(sequences_idx):
    video_name = f'internal_resource_{sequence_range[0]}_{sequence_range[1]}.avi'
    video_path = os.path.join(os.path.realpath(video_output_dir), video_name)

    images = df_frames.iloc[sequence_range[0]: sequence_range[1]]['file_path'].to_numpy().tolist()
    if not images:
        continue

    # The first frame defines the video resolution.
    frame = cv2.imread(images[0])
    if frame is None:
        raise FileNotFoundError(f'Could not read frame image: {images[0]}')
    height, width, layers = frame.shape

    # fourcc=0 is kept from the original cell.
    video = cv2.VideoWriter(video_path, 0, fps, (width, height))
    try:
        for image_path in images:
            video.write(cv2.imread(image_path))
    finally:
        # Always finalize the container, even if a frame fails to load.
        video.release()

1it [00:13, 13.54s/it]


In [11]:
# Show the frame-image paths derived from the keypoints file list (one row per frame).
df_frames

Unnamed: 0,file_path
0,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000070.jpg
1,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000071.jpg
2,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000072.jpg
3,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000073.jpg
4,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000074.jpg
...,...
516,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000586.jpg
517,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000587.jpg
518,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000588.jpg
519,/home/tom/Desktop/projects/sign-language/data/video_translation/tmp/internal_resource_2343/frames/internal_resource_2343_000589.jpg


In [12]:
# Slice the full keypoints tensor into one tensor per detected [start, end) range.
sequences_tensor_list = [
    keypoints_all_tensor[first_frame:end_frame]
    for first_frame, end_frame in sequences_idx
]

# Print each sequence's shape as a quick sanity check.
for sequence_tensor in sequences_tensor_list:
    print(sequence_tensor.shape)

torch.Size([521, 57, 2])


In [13]:
### Predict sequences
# Classify every detected sequence. Outputs:
#   y_preds_proba - list of (1, NUM_CLASSES) softmax arrays, one per classified sequence
#   y_preds       - list of arg-max class indices, aligned with y_preds_proba

# Load sequence classification model
# map_location keeps the checkpoint loadable on DEVICE; the network is moved to
# DEVICE as well so it lives on the same device as its inputs.
state_dict = torch.load(SEQUENCE_NET_PATH, map_location=DEVICE)
sequence_net = SequenceRecognitionNet(NUM_CLASSES).to(DEVICE)
sequence_net.load_state_dict(state_dict)
sequence_net.eval()

# Create sequence padding transformer
sequence_padder = KeypointsSequencePadding(SEQUENCE_LENGTH_MAX)

# Sequence iteration - padding and prediction
y_preds_proba = []
y_preds = []
with torch.no_grad():  # inference only, no autograd graph needed
    for sequence_tensor in sequences_tensor_list:
        # Sequences shorter than 5 frames are ignored. The subtitle-rendering
        # cell applies the same threshold so labels stay aligned with ranges.
        if sequence_tensor.shape[0] < 5:
            continue

        # Bring the sequence to SEQUENCE_LENGTH_MAX frames and add a batch
        # dimension. (Recorded output: a 521-frame input becomes [1, 50, 57, 2].)
        X = torch.Tensor(sequence_padder(sequence_tensor)).unsqueeze(0).float().to(DEVICE)
        print(X.shape)

        y_pred_proba = F.softmax(sequence_net(X), dim=1).cpu().numpy()
        # Plain int so the value is a clean key for label lookups.
        y_pred = int(y_pred_proba.argmax(axis=1)[0])

        y_preds_proba.append(y_pred_proba)
        y_preds.append(y_pred)

torch.Size([1, 50, 57, 2])


In [14]:
# Predicted class index per classified sequence (translated via label_map in the next cell).
y_preds

[9]

In [15]:
# Class index -> human-readable sign label (list position is the class index).
label_map = dict(enumerate([
    'Meet',
    'Name',
    'Good day',
    'See you around',
    'Thank you',
    'Hello',
    'Bye bye',
    'Tom',
    'Nice',
    'You',
    'My',
]))

# For each classified sequence print the winning label, then the full
# probability distribution over all classes.
for sequence_number, (pred_label, class_probabilities) in enumerate(zip(y_preds, y_preds_proba), start=1):
    print(f'\nSequence: {sequence_number}, label={label_map[pred_label]}')
    for class_index, class_name in label_map.items():
        print(f'{class_name}: {class_probabilities[0][class_index]:.08f}')



Sequence: 1, label=You
Meet: 0.00004011
Name: 0.05519281
Good day: 0.01193496
See you around: 0.00001312
Thank you: 0.00001720
Hello: 0.00151290
Bye bye: 0.00000637
Tom: 0.02031478
Nice: 0.00028716
You: 0.91066819
My: 0.00001241


In [16]:
# Video ensemble based on rendered images with subtitles
#
# Each output frame is the rendered skeleton image with an equally wide black
# panel added on its left. The panel carries the frame counter, FPS, stop-pose
# flag and the predicted label of the sequence the frame belongs to; a green box
# marks the keypoint extent of that sequence on the image.
#
# Labels are taken from `y_preds` as produced by the sequence-prediction cell.
# They are deliberately not overridden here, so the subtitles always reflect
# the model output.

# Set output video params
video_output_dir = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_VIDEO_RENDERED_FOLDER_NAME)
fps = VIDEO_OUTPUT_FPS
# Must match the threshold used when sequences were classified, otherwise
# y_preds would no longer line up with the ranges in sequences_idx.
min_sequence_length = 5

# Load list of rendered images
rendered_images_dir_path = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, 'rendered_images')
rendered_images_paths = sorted(glob.glob(f'{rendered_images_dir_path}/*.png'))

df_frames = pd.DataFrame(data=rendered_images_paths, columns=['rendered_image_path'])
images = df_frames['rendered_image_path'].to_numpy().tolist()

if not images:
    raise FileNotFoundError(f'No rendered images found in {rendered_images_dir_path}')
if len(images) > len(keypoints_paths):
    # Typically caused by stale files from an earlier run in rendered_images.
    raise ValueError(
        f'Found {len(images)} rendered images but only {len(keypoints_paths)} keypoints files'
    )

# Generate video for each sequence
video_name = f'{video_file_name}_skeleton_translation.avi'
video_path = os.path.join(os.path.realpath(video_output_dir), video_name)

# Derive the geometry from the first rendered image instead of assuming
# 1080x1920 input; VideoWriter drops frames whose size differs from its own.
first_image = cv2.imread(images[0])
if first_image is None:
    raise FileNotFoundError(f'Could not read rendered image: {images[0]}')
frame_height, frame_width = first_image.shape[:2]
left_border = frame_width  # width of the black text panel added on the left
height, width, layers = frame_height, frame_width + left_border, 3

# Get unscaled keypoints
keypoints_all_unscaled = np.array([read_keypoints(keypoints_path) for keypoints_path in keypoints_paths])

# Per-frame bounding box corners and labels. Frames outside any classified
# sequence keep a zero-size box and the label -1.
num_frames = len(keypoints_all_unscaled)
boxes_min = np.zeros([num_frames, 2])
boxes_max = np.zeros([num_frames, 2])
image_labels = np.full(num_frames, -1, dtype=int)

seq_iter = 0
for sequence_range in sequences_idx:
    start, end = int(sequence_range[0]), int(sequence_range[1])
    if (end - start) < min_sequence_length:
        continue
    keypoints_unscaled_sequence = keypoints_all_unscaled[start:end]

    # Extent over all frames and all keypoints of the sequence.
    boxes_min[start:end] = keypoints_unscaled_sequence.min(axis=(0, 1))
    boxes_max[start:end] = keypoints_unscaled_sequence.max(axis=(0, 1))

    # Bind the predicted label to every frame of the sequence.
    image_labels[start:end] = y_preds[seq_iter]
    seq_iter += 1

font = cv2.FONT_HERSHEY_PLAIN
fourcc = cv2.VideoWriter_fourcc('X', 'V', 'I', 'D')
video = cv2.VideoWriter(video_path, fourcc, fps, (width, height))

try:
    for i, image_path in tqdm(enumerate(images), total=len(images)):

        # Read image
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f'Could not read rendered image: {image_path}')

        # Add the text panel on the left
        new_image = cv2.copyMakeBorder(image, 0, 0, left_border, 0, cv2.BORDER_CONSTANT)

        # Add bounding box (x shifted by the panel width)
        cv2.rectangle(
            new_image,
            (int(boxes_min[i][0] + left_border), int(boxes_min[i][1])),
            (int(boxes_max[i][0] + left_border), int(boxes_max[i][1])),
            (0, 255, 0),
            5
        )
        # Black background for the text block
        cv2.rectangle(
            new_image,
            (0, 900),
            (left_border, 1300),
            (0, 0, 0),
            -1
        )

        # Add text
        stop_pose_label = 'Yes' if stop_pose_preds[i] == 1 else 'No'

        label_id = int(image_labels[i])
        frame_label = '<ignored sequence>' if label_id == -1 else label_map[label_id]

        cv2.putText(new_image, f"Frame: {i}/{len(images)}", (250, 900), font, 5, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(new_image, f"FPS: {VIDEO_OUTPUT_FPS}", (250, 975), font, 5, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(new_image, f"Stop pose: {stop_pose_label}", (250, 1200), font, 5, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(new_image, f"{frame_label}", (250, 1300), font, 5, (255, 255, 255), 2, cv2.LINE_AA)

        # Write frame to video
        video.write(np.uint8(new_image))
finally:
    # Always finalize the container, even if a frame fails.
    video.release()

521it [00:34, 15.12it/s]


In [17]:
# Video ensemble based on rendered images (skeleton overlay only, no subtitles)
# Set output video params
video_output_dir = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, TEMP_VIDEO_RENDERED_FOLDER_NAME)
fps = VIDEO_OUTPUT_FPS

# Load list of rendered images
rendered_images_dir_path = os.path.join(OUTPUT_FOLDER, TEMP_BASE_FOLDER_NAME, video_file_name, 'rendered_images')
rendered_images_paths = sorted(glob.glob(f'{rendered_images_dir_path}/*.png'))

df_frames = pd.DataFrame(data=rendered_images_paths, columns=['rendered_image_path'])

# Generate a single video from all rendered frames
video_name = f'{video_file_name}_skeleton.avi'
video_path = os.path.join(os.path.realpath(video_output_dir), video_name)

images = df_frames['rendered_image_path'].to_numpy().tolist()
if not images:
    raise FileNotFoundError(f'No rendered images found in {rendered_images_dir_path}')

# The first frame defines the video resolution.
frame = cv2.imread(images[0])
if frame is None:
    raise FileNotFoundError(f'Could not read rendered image: {images[0]}')
height, width, layers = frame.shape

# fourcc=0 is kept from the original cell.
video = cv2.VideoWriter(video_path, 0, fps, (width, height))
try:
    for image_path in tqdm(images):
        # Read image and write it to the video unchanged
        video.write(cv2.imread(image_path))
finally:
    # Always finalize the container, even if a frame fails to load.
    video.release()

521it [00:26, 19.82it/s]
