In [2]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
gdrive_kaggle_wl_ar_sl = user_secrets.get_secret("gdrive_kaggle_wl_ar_sl")

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

def upload_to_gdrive(file_name, file_content):
    file_metadata = {
        'title': file_name,
        'parents': [{'id': gdrive_kaggle_wl_ar_sl}]
    }
    file = drive.CreateFile(file_metadata)
    file.SetContentString(file_content)
    file.Upload() # Files.insert()

In [1]:
!pip install opencv-python mediapipe sklearn matplotlib
!wget https://github.com/issamjebnouni/Arabic-Word-level-Sign-Language-Recognition/raw/refs/heads/main/KARSL-502_Labels.xlsx

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.2-py3-none-any.whl.metadata (1.6 kB)
Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.2-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling prot

In [129]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm.notebook import tqdm
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # '2' suppresses warnings and info messages
os_join = os.path.join

DATA_DIR = "/kaggle/input/karsl-502"
KPS_DIR = "/kaggle/working/karsl-kps"

mp_holistic = mp.solutions.holistic.Holistic
mp_face_nose_idx = sorted(mp.solutions.face_mesh_connections.FACEMESH_NOSE)[0][0]
mp_hand_wrist_idx = mp.solutions.hands.HandLandmark.WRIST
mp_pose_nose_idx = mp.solutions.pose.PoseLandmark.NOSE

pose_kps_idx = tuple(
    (
        mp.solutions.pose.PoseLandmark.LEFT_SHOULDER,
        mp.solutions.pose.PoseLandmark.RIGHT_SHOULDER,
        mp.solutions.pose.PoseLandmark.LEFT_ELBOW,
        mp.solutions.pose.PoseLandmark.RIGHT_ELBOW,
        mp.solutions.pose.PoseLandmark.LEFT_WRIST,
        mp.solutions.pose.PoseLandmark.RIGHT_WRIST,
    )
)
face_kps_idx = tuple(
    sorted(
        set(
            point
            for edge in [
                *mp.solutions.face_mesh_connections.FACEMESH_CONTOURS,
                *mp.solutions.face_mesh_connections.FACEMESH_IRISES,
            ]
            for point in edge
        )
    )
)
hand_kps_idx = tuple(range(len(mp.solutions.hands.HandLandmark)))

POSE_NUM = len(pose_kps_idx)
FACE_NUM = len(face_kps_idx)
HAND_NUM = len(hand_kps_idx)

KP2SLICE = {
    "pose": slice(0, POSE_NUM),
    "face": slice(POSE_NUM, POSE_NUM + FACE_NUM),
    "rh": slice(POSE_NUM + FACE_NUM, POSE_NUM + FACE_NUM + HAND_NUM),
    "lh": slice(POSE_NUM + FACE_NUM + HAND_NUM, POSE_NUM + FACE_NUM + HAND_NUM * 2),
}
POSE_KPS2IDX = {kps: idx for idx, kps in enumerate(pose_kps_idx)}
FACE_KPS2IDX = {kps: idx for idx, kps in enumerate(face_kps_idx)}
HAND_KPS2IDX = {kps: idx for idx, kps in enumerate(hand_kps_idx)}
KPS2IDX = {"pose": POSE_KPS2IDX, "face": FACE_KPS2IDX, "hand": HAND_KPS2IDX}


# usage: use it to draw mediapipe connections with the kps loaded from `.npy`arrays
for u, v in list(mp.solutions.face_mesh_connections.FACEMESH_IRISES)[:3]:
    print(face_kps_idx[FACE_KPS2IDX[u]], face_kps_idx[FACE_KPS2IDX[v]])


475 476
477 474
469 470


In [16]:
def get_karsl_words_min_frames_cnt():
    in_dir = "/kaggle/input/karsl-502"
    words_frames = defaultdict(lambda: (0, None))
    for signer in tqdm(["01", "02", "03"], desc="signer"):
        signer_dir = os_join(in_dir, signer, signer)

        for split in tqdm(["train", "test"], desc="split", leave=False):
            split_dir = os_join(signer_dir, split)

            for word in tqdm(range(1, 503), desc="words", leave=False):
                frames = (999, None)
                word_dir = os_join(split_dir, f"{word:04}")

                for rep in os.listdir(word_dir):
                    frames_dir = os_join(word_dir, rep)
                    frames_cnt = len(os.listdir(frames_dir))
                    if frames_cnt < frames[0]:
                        frames = (frames_cnt, frames_dir)

                if frames[0] > words_frames[word][0]:
                    words_frames[word] = frames
    return words_frames


# words_frames = get_karsl_words_min_frames_cnt()

In [None]:
# !tar -cf sample.tar.gz '/kaggle/input/karsl-502/03/03/test/0102/03_03_0102_(22_12_16_10_40_19)_c'
# sorted(words_frames.values())

In [27]:
bad_samples = [
    # this sample has >260 frames, and after inspection it has many unrelated frames, so just drop it
    'karsl-502/02/02/train/0443/03_02_0443_(15_11_17_15_52_07)_c',
]

PAD_TKN = -1
SEQ_LEN = 80

In [159]:
image_path = '/kaggle/input/karsl-502/01/01/train/0001/01_01_0001_(10_11_16_16_21_34)_c/01_01_0001_(10_11_16_16_21_34)_c_0019.jpg'
np_xyz = np.dtype((float, 3))
holistic = mp_holistic(
    refine_face_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

with holistic:
    frame = cv2.imread(image_path)
    results = mediapipe_detection(frame, holistic)
    print(results.right_hand)
    # all_kps = np.zeros((184, 3))  # (pose=6 + face=136 + rh+lh=42), xyz=3
    # pose_kps = all_kps[KP2SLICE["pose"]]
    # lms = results.pose_landmarks.landmark
    # pose_kps[:] = np.fromiter((get_xyz(lms[idx]) for idx in pose_kps_idx), dtype=np.dtype((float, 3)))
    # pose_kps[:] = np.array([get_xyz(lms[idx]) for idx in pose_kps_idx])
    # print(pose_kps)
    # pose_kps -= pose_kps[mp_pose_nose_idx]
    # face_kps = all_kps[KP2SLICE["face"]]
    # lms = results.face_landmarks.landmark
    # face_kps[:] = np.fromiter(((lms[idx].x, lms[idx].y, lms[idx].z) for idx in face_kps_idx), dtype=np_xyz)

    print(all_kps)

[x: 0.633573234
y: 0.564548492
z: -2.94025199e-07
, x: 0.605434656
y: 0.539195538
z: -0.00510149542
, x: 0.586323142
y: 0.501541257
z: -0.00866374373
, x: 0.583071291
y: 0.47038281
z: -0.0143345529
, x: 0.59221828
y: 0.448929936
z: -0.019237956
, x: 0.611100197
y: 0.4539074
z: 0.00591755146
, x: 0.613425255
y: 0.428992122
z: -0.00776894158
, x: 0.608940721
y: 0.419359416
z: -0.0183149651
, x: 0.601188302
y: 0.419207722
z: -0.0238378327
, x: 0.63553232
y: 0.455667049
z: 0.000891017844
, x: 0.626334727
y: 0.44703728
z: -0.0171043593
, x: 0.618685305
y: 0.477362841
z: -0.0222531948
, x: 0.612121642
y: 0.50435853
z: -0.0203166064
, x: 0.657861412
y: 0.465881765
z: -0.0065761595
, x: 0.644676268
y: 0.466106534
z: -0.0239591356
, x: 0.635433078
y: 0.493999124
z: -0.0208153557
, x: 0.628117144
y: 0.517374635
z: -0.0131998397
, x: 0.678627908
y: 0.482518584
z: -0.0148123801
, x: 0.664678156
y: 0.481347084
z: -0.0250919051
, x: 0.652446926
y: 0.501254857
z: -0.0206172019
, x: 0.644379
y: 0.5186

W0000 00:00:1754326004.131335   13810 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326004.158732   13810 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326004.163126   13808 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326004.163293   13810 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326004.163118   13809 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326004.179452   13808 inference_feedback_manager.cc:114] Feedback manager 

In [162]:
def extract_frame_keypoints(results):
    # TODO: normalize(?) keypoints after adjustment

    def get_xyz(lm):
        return (lm.x, lm.y, lm.z)

    # define numpy views, pose -> face -> rh -> lh
    all_kps = np.zeros((184, 3))  # (pose=6 + face=136 + rh+lh=42), xyz=3
    pose_kps = all_kps[KP2SLICE["pose"]]
    face_kps = all_kps[KP2SLICE["face"]]
    rh_kps = all_kps[KP2SLICE["rh"]]
    lh_kps = all_kps[KP2SLICE["lh"]]
    np_xyz = np.dtype((float, 3))
    
    def get_pose():
        if results.pose_landmarks is None:
            return
        nonlocal pose_kps
        lms = results.pose_landmarks.landmark
        pose_kps[:] = np.fromiter(((lms[idx].x, lms[idx].y, lms[idx].z) for idx in pose_kps_idx), dtype=np_xyz)
        # pose_kps -= pose_kps[mp_pose_nose_idx]

    def get_face():
        if results.face_landmarks is None:
            return
        nonlocal face_kps
        lms = results.face_landmarks.landmark
        face_kps[:] = np.fromiter(((lms[idx].x, lms[idx].y, lms[idx].z) for idx in face_kps_idx), dtype=np_xyz)
        # face_kps -= face_kps[mp_face_nose_idx]

    def get_rh():
        if results.right_hand_landmarks is None:
            return
        nonlocal rh_kps
        lms = results.right_hand_landmarks.landmark
        rh_kps[:] = np.fromiter(((lm.x, lm.y, lm.z) for lm in lms), dtype=np_xyz)
        # rh_kps -= rh_kps[mp_hand_wrist_idx]

    def get_lh():
        if results.left_hand_landmarks is None:
            return
        nonlocal lh_kps
        lms = results.left_hand_landmarks.landmark
        lh_kps[:] = np.fromiter(((lm.x, lm.y, lm.z) for lm in lms), dtype=np_xyz)
        # lh_kps -= lh_kps[mp_hand_wrist_idx]

    # with ThreadPoolExecutor(max_workers=4) as executor:
    #     executor.submit(get_pose)
    #     executor.submit(get_face)
    #     executor.submit(get_rh)
    #     executor.submit(get_lh)

    get_pose()
    get_face()
    get_rh()
    get_lh()
    
    return all_kps


def mediapipe_detection(image, model):
    # TODO: Normalize pixel values to the range [0, 1]
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return model.process(image_rgb)


def store_keypoint_arrays(data_dir, out_dir, signer, split, selected_words=None):
    """This function generates numpy arrays of keypoints for each video in the specified folder location.
    Args:
      signer(int): the signer of interest. Could be 01 or 02 or 03
      split(str): can be 'train', 'test' or 'val'
    """
    selected_words = selected_words or [f"{w:04}" for w in range(1, 503)]
    # out_dir = os_join(out_dir, "karsl-502", signer, split)
    # os.makedirs(out_dir, exist_ok=True)

    split_dir = os_join(data_dir, signer, signer, split)
    words_bar = tqdm(selected_words[1:2], desc=split_dir)
    for word in words_bar:
        word_kps_dir = os_join(out_dir, "all_kps", f"{signer}-{split}", word)
        words_bar.set_description(f"Current iteration: {word_kps_dir}")
        os.makedirs(word_kps_dir, exist_ok=True)

        word_dir = os_join(split_dir, word)
        videos = os.listdir(word_dir)
        videos_bar = tqdm(videos, leave=False)
        for video in videos_bar:
            video_dir = os_join(word_dir, video)
            videos_bar.set_description(f"Current video: {video_dir}")
            video_frames = sorted(os.listdir(video_dir))

            video_kps_dir = os_join(word_kps_dir, video)

            all_kps = []
            with mp_holistic(
                refine_face_landmarks=True,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5
            ) as holistic:
                for frame in video_frames:
                    frame = cv2.imread(os_join(video_dir, frame))
                    all_kps.append(
                        extract_frame_keypoints(mediapipe_detection(frame, holistic))
                    )

            np.save(video_kps_dir, all_kps)
        break

In [163]:
def extract_keypoints_from_frames(data_dir, kps_dir, signers=None, splits=None):
    if signers is None:
        signers = ["01", "02", "03"]
    if splits is None:
        splits = ["train", "test"]
    for signer in signers:
        for split in splits:
            store_keypoint_arrays(data_dir, kps_dir, signer, split)

extract_keypoints_from_frames(DATA_DIR, KPS_DIR)

/kaggle/input/karsl-502/01/01/train:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

W0000 00:00:1754326135.182584   13877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326135.206845   13877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326135.210530   13876 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326135.210823   13877 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326135.210945   13878 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326135.228097   13878 inference_feedback_manager.cc:114] Feedback manager 

/kaggle/input/karsl-502/02/02/train:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

W0000 00:00:1754326199.647825   14078 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326199.672200   14078 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326199.675838   14078 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326199.675913   14076 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326199.676039   14077 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326199.691009   14077 inference_feedback_manager.cc:114] Feedback manager 

/kaggle/input/karsl-502/02/02/test:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

W0000 00:00:1754326247.141048   14247 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326247.168580   14245 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326247.172352   14245 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326247.172364   14247 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326247.172927   14244 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326247.191232   14245 inference_feedback_manager.cc:114] Feedback manager 

/kaggle/input/karsl-502/03/03/train:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

W0000 00:00:1754326256.010706   14277 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326256.035306   14276 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326256.037920   14278 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326256.039067   14276 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326256.040907   14277 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326256.054098   14276 inference_feedback_manager.cc:114] Feedback manager 

/kaggle/input/karsl-502/03/03/test:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

W0000 00:00:1754326319.586349   14445 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326319.611526   14445 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326319.615409   14446 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326319.615404   14447 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326319.615501   14445 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754326319.634396   14446 inference_feedback_manager.cc:114] Feedback manager 

In [36]:
def load_keypoints(kps_dir, f_avg, split, words=None, signers=None):
    def pad_seq_(x, padding_amount):
        x = np.concatenate((x, np.repeat(x[-1], padding_amount, axis=0)), axis=0)

    signers = signers or ["01", "02", "03"]
    words = words or tuple((f"{v:04}" for v in range(1, 503)))

    kps_data_path = os_join(kps_dir, "all_kps")
    sequences = []
    for word in tqdm(words[:1]):
        for signer in signers:
            word_dir = os_join(kps_data_path, f"{signer}-{split}", word)
            sequences.append(
                [np.load(os_join(word_dir, video)) for video in os.listdir(word_dir)]
            )
    return sequences
    X = np.array(sequences)
    y = np.array([label_map[word] for word in words])
    y = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1, 1))

    return X, y

# X, y = load_keypoints(KPS_DIR, SEQ_LEN, "train")
seq = load_keypoints(KPS_DIR, SEQ_LEN, "train")

  0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
len(seq), len(seq[0]), len(seq[1]), len(seq[2]), seq[0][0].shape, seq[1][0].shape, seq[2][0].shape

(3, 1, 1, 1, (19, 184, 3), (23, 184, 3), (25, 184, 3))

In [76]:
np.concatenate(seq[0:3], axis=1).shape

(1, 67, 184, 3)