In [2]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
gdrive_kaggle_wl_ar_sl = user_secrets.get_secret("gdrive_kaggle_wl_ar_sl")

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

def upload_to_gdrive(file_name, file_content):
    file_metadata = {
        'title': file_name,
        'parents': [{'id': gdrive_kaggle_wl_ar_sl}]
    }
    file = drive.CreateFile(file_metadata)
    file.SetContentString(file_content)
    file.Upload() # Files.insert()

In [59]:
!export TF_CPP_MIN_LOG_LEVEL=2
!pip install -q opencv-python mediapipe matplotlib
!wget -O KARSL-502_Labels.xlsx -q https://github.com/issamjebnouni/Arabic-Word-level-Sign-Language-Recognition/raw/refs/heads/main/KARSL-502_Labels.xlsx
!wget -O hand_landmarker.task -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task
!wget -O pose_landmarker.task -q https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_heavy/float16/1/pose_landmarker_heavy.task
!wget -O face_landmarker_v2_with_blendshapes.task -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task

In [None]:
def upload_to_gdrive(file_name, file_content):
    file_metadata = {
        'title': file_name,
        'parents': [{'id': gdrive_kaggle_wl_ar_sl}]
    }
    file = drive.CreateFile(file_metadata)
    file.SetContentString(file_content)
    file.Upload() # Files.insert()

In [189]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm.notebook import tqdm
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # '2' suppresses warnings and info messages
os_join = os.path.join

DATA_DIR = "/kaggle/input/karsl-502"
KPS_DIR = "/kaggle/working/karsl-kps"

from mediapipe.tasks.python import BaseOptions, vision
VisionRunningMode = vision.RunningMode

mp_pose_options = vision.PoseLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='pose_landmarker.task'),
    running_mode=VisionRunningMode.VIDEO
)
mp_face_options = vision.FaceLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='face_landmarker_v2_with_blendshapes.task'),
    running_mode=VisionRunningMode.VIDEO,
    num_faces=1
)
mp_hands_options = vision.HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='hand_landmarker.task'),
    running_mode=VisionRunningMode.VIDEO,
    num_hands=2
)

mp_pose_nose_idx = mp.solutions.pose.PoseLandmark.NOSE
mp_face_nose_idx = sorted(mp.solutions.face_mesh_connections.FACEMESH_NOSE)[0][0]
mp_hand_wrist_idx = mp.solutions.hands.HandLandmark.WRIST

pose_kps_idx = tuple(
    (
        mp.solutions.pose.PoseLandmark.LEFT_SHOULDER,
        mp.solutions.pose.PoseLandmark.RIGHT_SHOULDER,
        mp.solutions.pose.PoseLandmark.LEFT_ELBOW,
        mp.solutions.pose.PoseLandmark.RIGHT_ELBOW,
        mp.solutions.pose.PoseLandmark.LEFT_WRIST,
        mp.solutions.pose.PoseLandmark.RIGHT_WRIST,
    )
)
face_kps_idx = tuple(
    sorted(
        set(
            point
            for edge in [
                *mp.solutions.face_mesh_connections.FACEMESH_CONTOURS,
                *mp.solutions.face_mesh_connections.FACEMESH_IRISES,
            ]
            for point in edge
        )
    )
)
hand_kps_idx = tuple(range(len(mp.solutions.hands.HandLandmark)))

POSE_NUM = len(pose_kps_idx)
FACE_NUM = len(face_kps_idx)
HAND_NUM = len(hand_kps_idx)

KP2SLICE = {
    "pose": slice(0, POSE_NUM),
    "face": slice(POSE_NUM, POSE_NUM + FACE_NUM),
    "rh": slice(POSE_NUM + FACE_NUM, POSE_NUM + FACE_NUM + HAND_NUM),
    "lh": slice(POSE_NUM + FACE_NUM + HAND_NUM, POSE_NUM + FACE_NUM + HAND_NUM * 2),
}
POSE_KPS2IDX = {kps: idx for idx, kps in enumerate(pose_kps_idx)}
FACE_KPS2IDX = {kps: idx for idx, kps in enumerate(face_kps_idx)}
HAND_KPS2IDX = {kps: idx for idx, kps in enumerate(hand_kps_idx)}
KPS2IDX = {"pose": POSE_KPS2IDX, "face": FACE_KPS2IDX, "hand": HAND_KPS2IDX}


# usage: use it to draw mediapipe connections with the kps loaded from `.npy`arrays
for u, v in list(mp.solutions.face_mesh_connections.FACEMESH_IRISES)[:3]:
    print(face_kps_idx[FACE_KPS2IDX[u]], face_kps_idx[FACE_KPS2IDX[v]])


475 476
477 474
469 470


In [None]:
def get_karsl_words_min_frames_cnt():
    in_dir = "/kaggle/input/karsl-502"
    words_frames = defaultdict(lambda: (0, None))
    for signer in tqdm(["01", "02", "03"], desc="signer"):
        signer_dir = os_join(in_dir, signer, signer)

        for split in tqdm(["train", "test"], desc="split", leave=False):
            split_dir = os_join(signer_dir, split)

            for word in tqdm(range(1, 503), desc="words", leave=False):
                frames = (999, None)
                word_dir = os_join(split_dir, f"{word:04}")

                for rep in os.listdir(word_dir):
                    frames_dir = os_join(word_dir, rep)
                    frames_cnt = len(os.listdir(frames_dir))
                    if frames_cnt < frames[0]:
                        frames = (frames_cnt, frames_dir)

                if frames[0] > words_frames[word][0]:
                    words_frames[word] = frames
    return words_frames


# words_frames = get_karsl_words_min_frames_cnt()

In [139]:
bad_samples = [
    # this sample has >260 frames, and after inspection it has many unrelated frames, so just drop it
    'karsl-502/02/02/train/0443/03_02_0443_(15_11_17_15_52_07)_c',
]

MS_30FPS = 1000/30
PAD_TKN = -1
SEQ_LEN = 80

In [155]:
image_dir = '/kaggle/input/karsl-502/01/01/train/0001/01_01_0001_(10_11_16_16_21_34)_c/'
image_path = os.path.join(image_dir, os.listdir(image_dir)[8])
# shutil.copy(image_path, '/kaggle/working')
frame = mp.Image.create_from_file(image_path)
# mp_image = Image(image_format=ImageFormat.SRGB, data=image)

# pose_model = vision.PoseLandmarker.create_from_options(pose_options)
# pose_res = pose_model.detect(frame)
# print(dir(pose_res))
# print(len(pose_res.pose_landmarks[0]))

# face_model = vision.FaceLandmarker.create_from_options(face_options)
# face_res = face_model.detect(frame)
# print(dir(face_res))
# print(face_res.facial_transformation_matrixes)
# print(len(face_res.face_landmarks[0]))

hands_model = vision.HandLandmarker.create_from_options(hands_options)
hands_res = hands_model.detect_for_video(frame, 0)
print(dir(hands_res))
print(hands_res.handedness[0])
print(len(hands_res.hand_landmarks[0]))

# annotated_image = draw_landmarks_on_image(frame.numpy_view(), hands_res)
# annotated_image = draw_pose_andmarks_on_image(frame.numpy_view(), pose_res)
# cv2.imwrite("drawn.jpg", cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))

['__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'hand_landmarks', 'hand_world_landmarks', 'handedness']
[Category(index=1, score=0.9807953238487244, display_name='Left', category_name='Left')]
21


I0000 00:00:1754371230.826115      36 task_runner.cc:85] GPU suport is not available: INTERNAL: ; RET_CHECK failure (mediapipe/gpu/gl_context_egl.cc:84) egl_initializedUnable to initialize EGL
W0000 00:00:1754371230.868143     819 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754371230.884169     819 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [199]:
def extract_frame_keypoints(frame_path, timestamp, pose_model, face_model, hands_model):
    # TODO: normalize(?) keypoints after adjustment

    # define numpy views, pose -> face -> rh -> lh
    all_kps = np.zeros((184, 3))  # (pose=6 + face=136 + rh+lh=42), xyz=3
    pose_kps = all_kps[KP2SLICE["pose"]]
    face_kps = all_kps[KP2SLICE["face"]]
    rh_kps = all_kps[KP2SLICE["rh"]]
    lh_kps = all_kps[KP2SLICE["lh"]]
    np_xyz = np.dtype((float, 3))

    frame = mp.Image.create_from_file(frame_path)
    
    def get_pose():
        nonlocal pose_kps
        results = pose_model.detect_for_video(frame, timestamp)
        if results.pose_landmarks is None:
            return

        lms = results.pose_landmarks[0]
        pose_kps[:] = np.fromiter(((lms[idx].x, lms[idx].y, lms[idx].z) for idx in pose_kps_idx), dtype=np_xyz)
        # pose_kps -= pose_kps[mp_pose_nose_idx]

    def get_face():
        nonlocal face_kps
        results = face_model.detect_for_video(frame, timestamp)
        if results.face_landmarks is None:
            return

        lms = results.face_landmarks[0]
        face_kps[:] = np.fromiter(((lms[idx].x, lms[idx].y, lms[idx].z) for idx in face_kps_idx), dtype=np_xyz)
        # face_kps -= face_kps[mp_face_nose_idx]

    def get_hands():
        nonlocal rh_kps, lh_kps
        results = hands_model.detect_for_video(frame, timestamp)
        if results.hand_landmarks is None:
            return

        for handedness, hand_lms in zip(hands_res.handedness, hands_res.hand_landmarks):
            target_hand = lh_kps if handedness[0].category_name == 'Left' else rh_kps
            target_hand[:] = np.fromiter(((lm.x, lm.y, lm.z) for lm in hand_lms), dtype=np_xyz)
            # target_hand -= target_hand[mp_face_nose_idx]

    with ThreadPoolExecutor(max_workers=3) as executor:
        executor.submit(get_pose)
        executor.submit(get_face)
        executor.submit(get_hands)
    
    return all_kps


def process_video(video_dir):
    video_dir = os_join(*video_dir)

    video_kps = []
    pose_model = vision.PoseLandmarker.create_from_options(mp_pose_options)
    face_model = vision.FaceLandmarker.create_from_options(mp_face_options)
    hands_model = vision.HandLandmarker.create_from_options(mp_hands_options)
    with pose_model, face_model, hands_model:
        for idx, frame in enumerate(sorted(os.listdir(video_dir))):
            frame_path = os_join(video_dir, frame)
            timestamp = int(idx * MS_30FPS)
            video_kps.append(
                extract_frame_keypoints(frame_path, timestamp, pose_model, face_model, hands_model)
            )

    return np.array(video_kps)


def store_keypoint_arrays(word_dir, out_dir, split, signer, word, max_videos):
    # all_kps = []
    # videos_bar = tqdm(os.listdir(word_dir)[:max_videos], leave=False)
    # for video in videos_bar:
    #     videos_bar.set_description(f"Current video: {video}")
    #     result = process_video(word_dir, video)
    #     print(result.shape)
    #     all_kps.append(result)

    video_dirs = [(word_dir, video) for video in os.listdir(word_dir)]
    with ProcessPoolExecutor(max_workers=2) as executor:
        results = list(
            tqdm(
                executor.map(process_video, video_dirs),
                total=len(video_dirs),
                desc=f"Processing Videos for '{word}'",
                leave=False
            )
        )
    
    all_kps = [kps for kps in results if kps is not None]


    # word_kps_path = os_join(out_dir, "all_kps", f"{signer}-{split}", word)
    # os.makedirs(os.path.dirname(word_kps_path), exist_ok=True)
    # np.savez(word_kps_path, keypoints=np.concatenate(all_kps, axis=0))

In [200]:
def extract_keypoints_from_frames(data_dir, kps_dir, splits=None, signers=None, selected_words=None):
    splits = splits or ["train", "test"]
    signers = signers or ["01", "02", "03"]
    selected_words = selected_words or tuple((f"{v:04}" for v in range(46, 503)))
    words_bar = tqdm(selected_words)
    for word in words_bar:
        words_bar.set_description(f"Current word: {word}")
        signers_bar = tqdm(signers, leave=False)
        for signer in signers:
            signers_bar.set_description(f"Current signer: {signer}")
            splits_bar = tqdm(splits, leave=False)
            for split in splits:
                splits_bar.set_description(f"Current split: {split}")
                word_dir = os_join(data_dir, signer, signer, split, word)
                store_keypoint_arrays(word_dir, kps_dir, split, signer, word, max_videos=10)
    
extract_keypoints_from_frames(DATA_DIR, KPS_DIR)

  0%|          | 0/457 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

I0000 00:00:1754373059.991072      36 task_runner.cc:85] GPU suport is not available: INTERNAL: ; RET_CHECK failure (mediapipe/gpu/gl_context_egl.cc:84) egl_initializedUnable to initialize EGL
I0000 00:00:1754373059.991072      36 task_runner.cc:85] GPU suport is not available: INTERNAL: ; RET_CHECK failure (mediapipe/gpu/gl_context_egl.cc:84) egl_initializedUnable to initialize EGL


Processing Videos for '0046':   0%|          | 0/42 [00:00<?, ?it/s]

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [204]:
os.makedirs(os.path.dirname(word_kps_path), exist_ok=True)

NameError: name 'word_kps_path' is not defined

In [None]:
!python prepare_kps.py

2025-08-05 06:02:21.725683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754373741.752148    5649 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754373741.760238    5649 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
475 476
477 474
469 470
Current word: 0046:   0%|                               | 0/457 [00:00<?, ?it/s]
  0%|                                                     | 0/3 [00:00<?, ?it/s][A
Current signer: 01:   0%|                                 | 0/3 [00:00<?, ?it/s][A

  0%|                                                     | 0/2 [00:00<?, ?it/s][A[A

Current split: train:   0%|                               | 0/2 [00:00<?, ?

In [None]:
def load_keypoints(kps_dir, f_avg, split, words=None, signers=None):
    def pad_seq_(x, padding_amount):
        x = np.concatenate((x, np.repeat(x[-1], padding_amount, axis=0)), axis=0)

    signers = signers or ["01", "02", "03"]
    words = words or tuple((f"{v:04}" for v in range(1, 503)))

    kps_data_path = os_join(kps_dir, "all_kps")
    sequences = []
    for word in tqdm(words[:1]):
        for signer in signers:
            word_dir = os_join(kps_data_path, f"{signer}-{split}", word)
            sequences.append(
                [np.load(os_join(word_dir, video)) for video in os.listdir(word_dir)]
            )
    return sequences
    X = np.array(sequences)
    y = np.array([label_map[word] for word in words])
    y = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1, 1))

    return X, y

# X, y = load_keypoints(KPS_DIR, SEQ_LEN, "train")
seq = load_keypoints(KPS_DIR, SEQ_LEN, "train")

In [None]:
len(seq), len(seq[0]), len(seq[1]), len(seq[2]), seq[0][0].shape, seq[1][0].shape, seq[2][0].shape

In [None]:
np.concatenate(seq, axis=1).shape

In [None]:
!tar -cf all-kps /kaggle/working/karsl-kps/all_kps