# EDA Data

In [1]:
import os
import json
from typing import Dict, List
import pandas as pd

## Load WLASL dataset

In [2]:
def load_json(path: str) -> List[Dict]:
    """Read a JSON file and return its parsed content.

    The number of top-level entries is printed as a quick sanity check.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON payload (annotated as a list of dictionaries).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))
    return data


# Parse the WLASL v0.3 annotation file; load_json also prints the number of top-level entries.
wlasl = load_json("../data/raw/WLASL_v0.3.json")

2000


In [3]:
# Number of instance records attached to the first entry of the annotation list.
len(wlasl[0]['instances'])

40

In [15]:
# Inspect the fields stored on a single instance record.
wlasl[0]['instances'][5]

{'bbox': [110, 25, 274, 240],
 'fps': 25,
 'frame_end': 2249,
 'frame_start': 2150,
 'instance_id': 5,
 'signer_id': 121,
 'source': 'northtexas',
 'split': 'val',
 'url': 'https://www.youtube.com/watch?v=hjS0dQDgbjo',
 'variation_id': 0,
 'video_id': '70212'}

## Extract Videos

In [None]:
def extract_videos_by_ids(json_lst: List[Dict]) -> List[str]:
    """Return the paths of the instance videos that are present on disk.

    Each record's ``video_id`` is mapped to ``../data/raw/videos/<video_id>.mp4``;
    paths that do not exist are dropped. The input order is preserved.
    """
    candidate_paths = (
        f"../data/raw/videos/{record['video_id']}.mp4" for record in json_lst
    )
    return [path for path in candidate_paths if os.path.exists(path)]

# Build one row per instance whose video file is actually on disk.
# All columns are appended together inside the same guard so they stay aligned
# row-for-row: an instance with a missing video contributes to none of them.
video_path_lst, labels = [], []
frame_start, frame_end, split = [], [], []
for word_instances in wlasl:
    for instance in word_instances['instances']:
        # Reuse the on-disk check; an empty result means the video is missing.
        existing_paths = extract_videos_by_ids([instance])
        if not existing_paths:
            continue
        video_path_lst.append(existing_paths[0])
        labels.append(word_instances['gloss'])
        frame_start.append(instance['frame_start'])
        frame_end.append(instance['frame_end'])
        split.append(instance['split'])

df = pd.DataFrame(data={
    "video_path": video_path_lst,
    "label": labels,
    'frame_start': frame_start,
    'frame_end': frame_end,
    'split': split,
})
df.head(20)

Unnamed: 0,video_path,labels
0,../data/raw/videos/69241.mp4,book
1,../data/raw/videos/07069.mp4,book
2,../data/raw/videos/07068.mp4,book
3,../data/raw/videos/07070.mp4,book
4,../data/raw/videos/07099.mp4,book
5,../data/raw/videos/07074.mp4,book
6,../data/raw/videos/69302.mp4,drink
7,../data/raw/videos/65539.mp4,drink
8,../data/raw/videos/17710.mp4,drink
9,../data/raw/videos/17733.mp4,drink


## Display Examples

In [None]:
import mediapy as media

# Index labels of the rows to preview.
ids = [2,8,27,45,78]

# Pull both columns for the selected rows in one label-based lookup.
preview_rows = df.loc[ids, ['video_path', 'label']]

videos = []
for path in preview_rows['video_path']:
    videos.append(media.read_video(path))
titles = list(preview_rows['label'])

# Render the clips side by side, each captioned with its label.
media.show_videos(videos, titles=titles, height=200)

0,1,2,3,4
book  This browser does not support the video tag.,drink  This browser does not support the video tag.,computer  This browser does not support the video tag.,before  This browser does not support the video tag.,who  This browser does not support the video tag.


## Preprocess Videos

In [6]:
import mediapipe as mp
import cv2
import time
import numpy as np

### Display Keypoint Features

In [7]:
# Short aliases for the MediaPipe modules used below:
# drawing_utils supplies draw_landmarks, holistic supplies the Holistic model
# and the connection sets passed to draw_landmarks.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

# Title of the OpenCV window used for the landmark preview.
winname = "Holistic Model Detection"
# cv2.namedWindow(winname)        # Create a named window
# cv2.moveWindow(winname, 40,30)  # Move it to (40,30)

def drawing_landmarks(frame, holistic_results):
    """Overlay the face, pose and both hand landmark sets on ``frame``.

    The frame is passed to ``mp_drawing.draw_landmarks`` once per landmark
    group, in the order face, pose, left hand, right hand. Nothing is returned.
    """
    landmark_groups = (
        (holistic_results.face_landmarks, mp_holistic.FACEMESH_CONTOURS),
        (holistic_results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (holistic_results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (holistic_results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks, connections in landmark_groups:
        mp_drawing.draw_landmarks(frame, landmarks, connections)


In [8]:
# Play one video with the Holistic landmarks drawn on top. Press "q" to stop.
video_path = df['video_path'][6]
print(video_path)

cap = cv2.VideoCapture(video_path)

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

print(width, height, frames, fps)

if not cap.isOpened(): 
    print("Error opening the video file.")

# Per-frame display delay in milliseconds. Guard against fps == 0 (reported
# when the file could not be opened) to avoid a division by zero.
frame_delay_ms = max(1, int(1000 / fps)) if fps > 0 else 1

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            # The model is fed an RGB copy; the original BGR frame is kept for
            # drawing and display because cv2.imshow interprets pixels as BGR.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = holistic.process(rgb_frame)
            drawing_landmarks(frame, results)

            cv2.imshow(winname, frame)
            # waitKey both paces playback and polls the keyboard.
            if cv2.waitKey(frame_delay_ms) & 0xFF == ord('q'):
                break
finally:
    # Always free the capture and close the window, even after an error or interrupt.
    cap.release()
    cv2.destroyAllWindows()

../data/raw/videos/69302.mp4
1920 1080 77 29


I0000 00:00:1734936465.708608    3972 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1734936465.735200    4854 gl_context.cc:369] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.0.4-0ubuntu1~22.04.1), renderer: D3D12 (NVIDIA GeForce RTX 4070)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1734936465.805998    4825 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734936465.842509    4831 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734936465.853905    4824 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734936465.856685    4836 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signatu

### Extract Keypoint Features

Use the MediaPipe Holistic model to extract keypoint features for the face, pose, left hand, and right hand.

In [9]:
def extract_keypoints(results):
    """Flatten the landmarks in ``results`` into one 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * face landmarks       -> (x, y, z) per landmark
      * pose landmarks       -> (x, y, z, visibility) per landmark
      * left-hand landmarks  -> (x, y, z) per landmark
      * right-hand landmarks -> (x, y, z) per landmark

    A group that is missing (falsy) is replaced by zeros of length
    468*3, 33*4, 21*3 and 21*3 respectively.
    """
    def _flatten(landmark_list, fields, size_if_missing):
        # Zero-fill when the detector produced nothing for this group.
        if not landmark_list:
            return np.zeros(size_if_missing)
        rows = [
            [getattr(point, field) for field in fields]
            for point in landmark_list.landmark
        ]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    xyz_visibility = ("x", "y", "z", "visibility")

    parts = [
        _flatten(results.face_landmarks, xyz, 468 * 3),
        _flatten(results.pose_landmarks, xyz_visibility, 33 * 4),
        _flatten(results.left_hand_landmarks, xyz, 21 * 3),
        _flatten(results.right_hand_landmarks, xyz, 21 * 3),
    ]
    return np.concatenate(parts)


def collect_feature(
    video_df: pd.DataFrame,
    num_frames: int = 30,
    output_dir: str = "../data/preprocessed/",
):
    """Extract keypoint sequences from each video and save them as ``.npy`` files.

    For every row, up to ``num_frames`` frames are read from the start of the
    video, run through the Holistic model, and converted with
    ``extract_keypoints``. The stacked result is written to
    ``<output_dir>/<label>/sequence_<row position>.npy``. A video with fewer
    readable frames yields a correspondingly shorter sequence.

    Videos that cannot be opened, or that yield no frames, are reported and
    skipped rather than aborting the whole run.

    Args:
        video_df: Frame with a ``video_path`` and a ``label`` column.
        num_frames: Maximum number of leading frames to process per video.
        output_dir: Root directory under which the per-label folders are created.

    Returns:
        None. Results are written to disk.
    """
    for i, (video_path, label) in enumerate(zip(video_df['video_path'], video_df['label'])):
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print("Error opening the video file.", video_path)
                continue

            keypoint_feature = []
            # A fresh model per video, so no state is shared between clips.
            with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                for _ in range(num_frames):
                    success, frame = cap.read()
                    if not success:
                        break

                    # OpenCV decodes to BGR; convert to RGB before running the model.
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    results = holistic.process(frame)
                    keypoint_feature.append(extract_keypoints(results))
        finally:
            # Release the capture even if processing raised or was interrupted.
            cap.release()

        # np.stack raises on an empty list, so skip videos that produced no frames.
        if not keypoint_feature:
            print("No frames could be read from the video file.", video_path)
            continue

        keypoint_features = np.stack(keypoint_feature)
        npy_path = os.path.join(output_dir, label, f"sequence_{i}.npy")
        os.makedirs(os.path.dirname(npy_path), exist_ok=True)
        np.save(npy_path, keypoint_features)

In [10]:
collect_feature(df)

I0000 00:00:1734937573.008242    3972 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1734937573.032827   12622 gl_context.cc:369] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.0.4-0ubuntu1~22.04.1), renderer: D3D12 (NVIDIA GeForce RTX 4070)
W0000 00:00:1734937573.088914   12606 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734937573.114269   12616 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734937573.119303   12616 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1734937573.120085   12610 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0

KeyboardInterrupt: 