**IMPORT DEPENDENCIES**

In [1]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.layers import Input, Dense, Dropout, Flatten, Conv1D, MaxPooling1D
from keras.callbacks import TensorBoard
from keras.models import Sequential

# Filesystem layout used by the cells below:
#   IMAGE_PATH - root folder listed for per-class sub-folders of input images
#   DATA_PATH  - root folder where extracted keypoint .npy files are written
#   SAVE_DIR   - folder holding the saved train/test split arrays
IMAGE_PATH = "Images"
DATA_PATH = "Data/static"
SAVE_DIR = "result/static"

**DEFINE KEY FUNCTIONS**

In [None]:
# Short aliases for the MediaPipe solution modules used throughout the notebook.
mp_hands = mp.solutions.hands  # hand detector (Hands) and HAND_CONNECTIONS
mp_drawing = mp.solutions.drawing_utils  # draw_landmarks helper
mp_drawing_styles = mp.solutions.drawing_styles  # default landmark/connection styles

def mediapipe_detection(image, model):
    """Run a MediaPipe model on a BGR image.

    Args:
        image: Image array in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (e.g. an open
            ``mp_hands.Hands`` instance).

    Returns:
        Tuple ``(image, results)`` where ``image`` is a BGR copy produced by
        the round-trip colour conversion and ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The frame is flagged read-only only for the duration of the model call
    # (presumably so MediaPipe can take it by reference) and restored after.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

def draw_landmarks(image, results):
    """Draw every detected hand's landmarks and connections onto ``image``.

    Args:
        image: BGR image array; it is drawn on in place.
        results: Detection result exposing ``multi_hand_landmarks``.

    Returns:
        None. When ``multi_hand_landmarks`` is ``None`` or empty (no hand
        detected) nothing is drawn instead of raising ``TypeError``.
    """
    # `multi_hand_landmarks` is None when no hand was found, so fall back to
    # an empty tuple to keep the loop safe for unguarded callers.
    for hand_landmarks in results.multi_hand_landmarks or ():
        mp_drawing.draw_landmarks(
            image,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style())
    
def extract_keypoints(image, results):
    """Return the first detected hand's landmarks as rows of ``[x, y, z]``.

    Args:
        image: Unused; kept so existing call sites keep working.
        results: Detection result exposing ``multi_hand_landmarks``.

    Returns:
        ``None`` when no hand was detected; a ``(21, 3)`` array of zeros when
        the first hand has an empty landmark list; otherwise an array with one
        ``[x, y, z]`` row per landmark of the first hand.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return None

    first_hand = detected_hands[0]
    if not first_hand.landmark:
        return np.zeros((21, 3))

    return np.array([[point.x, point.y, point.z] for point in first_hand.landmark])

**EXTRACT KEYPOINTS USING MEDIAPIPE HANDS**

In [None]:
# Walk IMAGE_PATH/<class>/<image>, run hand detection on every readable image
# and store the first hand's keypoints as DATA_PATH/<class>/<image stem>.npy.
with mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.7, max_num_hands=1) as hands:
    for class_name in os.listdir(IMAGE_PATH):
        class_dir = os.path.join(IMAGE_PATH, class_name)

        # Skip stray files sitting next to the class folders; listing them
        # would raise NotADirectoryError.
        if not os.path.isdir(class_dir):
            continue

        output_dir = os.path.join(DATA_PATH, class_name)

        for file in tqdm(os.listdir(class_dir)):
            image_path = os.path.join(class_dir, file)
            image = cv2.imread(image_path)

            # cv2.imread returns None for unreadable / non-image files.
            if image is None:
                continue

            image, results = mediapipe_detection(image, hands)
            keypoints = extract_keypoints(image, results)

            # extract_keypoints returns None when no hand is found. Testing the
            # array's truthiness directly would raise ValueError for a (21, 3)
            # array, so compare against None explicitly.
            if keypoints is not None:
                os.makedirs(output_dir, exist_ok=True)
                save_path = os.path.join(output_dir, file.rsplit(".", 1)[0] + ".npy")
                np.save(save_path, keypoints)

        print("Processed directory:", class_name)

**LABEL DATA FOR TRAINING**

In [None]:
# Class names are the sub-folders of DATA_PATH; a label's integer id is its
# position in this listing. The test cell rebuilds the same listing to map
# predictions back to names, so the order here must stay the os.listdir order.
alphabets = np.array(os.listdir(DATA_PATH))
label_map = {label: num for num, label in enumerate(alphabets)}

EXPECTED_SHAPE = (21, 3)  # 21 hand landmarks x (x, y, z), matching the model input

data, labels = [], []

for label in tqdm(alphabets):
    label_dir = os.path.join(DATA_PATH, label)
    npy_dir = os.path.join(label_dir, "npy")
    # The extraction cell writes .npy files directly into DATA_PATH/<label>/;
    # a nested "npy" folder is still honoured when it exists.
    path = npy_dir if os.path.isdir(npy_dir) else label_dir

    for file in os.listdir(path):
        if not file.endswith(".npy"):
            continue
        file_path = os.path.join(path, file)
        try:
            keypoints = np.load(file_path)
        except Exception as e:
            print(f"Skipping file {file_path} due to error: {e}")
            continue

        # A single odd-shaped sample would turn X into a ragged array.
        if keypoints.shape != EXPECTED_SHAPE:
            print(f"Skipping file {file_path} due to error: unexpected shape {keypoints.shape}")
            continue

        data.append(keypoints)
        labels.append(label_map[label])

# Convert to arrays and one-hot encode labels
X = np.array(data)
y = to_categorical(np.array(labels), num_classes=len(alphabets))

# Train-test split (fixed seed so the split is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

 46%|████▌     | 11/24 [00:22<00:30,  2.34s/it]

**SAVE TRAIN/TEST DATA**

In [None]:
# Persist the four split arrays under SAVE_DIR so later cells can reload them
# without re-running the labelling step.
os.makedirs(SAVE_DIR, exist_ok=True)
for target_file, array in (
    (f"{SAVE_DIR}/X_train.npy", X_train),
    (f"{SAVE_DIR}/X_test.npy", X_test),
    (f"{SAVE_DIR}/y_train.npy", y_train),
    (f"{SAVE_DIR}/y_test.npy", y_test),
):
    np.save(target_file, array)

**LOAD TRAIN/TEST DATA**

In [None]:
# Reload the train/test split written under SAVE_DIR.
X_train, X_test, y_train, y_test = (
    np.load(f"{SAVE_DIR}/X_train.npy"),
    np.load(f"{SAVE_DIR}/X_test.npy"),
    np.load(f"{SAVE_DIR}/y_train.npy"),
    np.load(f"{SAVE_DIR}/y_test.npy"),
)

**DEFINE AND COMPILE MODEL**

In [None]:
# TensorBoard event files are written to ./logs.
log_dir = os.path.join("logs")
tb_callback = TensorBoard(log_dir=log_dir)

# y_train is one-hot encoded, so its second dimension is the number of classes.
# Deriving the output width from it keeps the softmax layer in sync with the
# dataset instead of relying on a hand-edited constant.
num_classes = y_train.shape[1]

model = Sequential([
    Input(shape=(21, 3)), # 21 keypoints of hand x 3 coordinates (x,y,z)
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax') # one output unit per class
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

**FIT MODEL**

In [None]:
# Train on the training split for 30 epochs, logging to TensorBoard.
model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    callbacks=[tb_callback],
)

**SAVE MODEL**

In [None]:
# Write the trained model to models/static.h5; the static test cell loads it
# back from this same path.
model.save("models/static.h5")

**TEST STATIC MODEL**

In [None]:
from keras.models import load_model

# Load the classifier written by the "SAVE MODEL" cell.
model = load_model("models/static.h5")
threshold = 0.8  # minimum softmax confidence required to display a prediction

# Class names indexed by predicted class id. This relies on os.listdir
# returning the folders in the same order as when the labels were built.
alphabets = np.array(os.listdir(DATA_PATH))

cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            # A failed grab (camera unplugged, stream ended) yields no frame;
            # stop instead of passing None to cv2.cvtColor.
            if not ret or frame is None:
                break

            image, results = mediapipe_detection(frame, hands)

            if results.multi_hand_landmarks:
                draw_landmarks(image, results)
                # Only the first detected hand is classified.
                keypoints = extract_keypoints(image, results)
                if keypoints is not None and keypoints.size > 0:
                    keypoints = keypoints.reshape(21, 3)
                    # Add a leading batch axis: (21, 3) -> (1, 21, 3).
                    prediction = model.predict(np.expand_dims(keypoints, axis=0), verbose=0)
                    class_id = np.argmax(prediction)
                    confidence = np.max(prediction)

                    if confidence > threshold:
                        label = alphabets[class_id]
                        cv2.putText(image, f'{label} {int(confidence * 100)}%', (10, 70),
                                    cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 3)

            cv2.imshow('ASL Recognition', image)
            # Press "q" to quit.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even if a frame
    # raised an exception mid-loop.
    cap.release()
    cv2.destroyAllWindows()

**TEST DYNAMIC MODEL**

In [None]:
from keras.models import load_model

# NOTE(review): this rebinds the notebook-wide DATA_PATH (previously the static
# data folder); re-running earlier cells after this one will see the new value.
DATA_PATH = "./Data/dynamic"

# Load trained model
model = load_model("models/dynamic.h5")
threshold = 0.8  # minimum softmax confidence required to display a prediction

# Class labels
actions = np.array(os.listdir(DATA_PATH))

# Frame buffer: sliding window of the most recent flattened keypoint vectors
sequence = []
SEQ_LENGTH = 30

cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        start_recording = False
        countdown = 3  # seconds shown on screen before recording starts
        countdown_start_time = None
        while cap.isOpened():
            ret, frame = cap.read()
            # A failed grab (camera unplugged, stream ended) yields no frame;
            # stop instead of passing None to cv2.cvtColor.
            if not ret or frame is None:
                break

            image, results = mediapipe_detection(frame, hands)

            if not start_recording:
                if countdown_start_time is None:
                    countdown_start_time = cv2.getTickCount()

                elapsed_time = (cv2.getTickCount() - countdown_start_time) / cv2.getTickFrequency()

                # Display the countdown number
                if elapsed_time < countdown:
                    number = countdown - int(elapsed_time)
                    cv2.putText(image, f"{number}", (250, 200),
                                cv2.FONT_HERSHEY_SIMPLEX, 4, (0, 0, 255), 6)
                else:
                    start_recording = True  # countdown finished: start recording
                    sequence = []
            else:
                # Once recording has started, process the MediaPipe keypoints
                if results.multi_hand_landmarks:
                    draw_landmarks(image, results)
                    keypoints = extract_keypoints(image, results)

                    if keypoints is not None and keypoints.size > 0:
                        keypoints = keypoints.flatten()
                        sequence.append(keypoints)

                        # Keep only the latest SEQ_LENGTH frames.
                        if len(sequence) > SEQ_LENGTH:
                            sequence.pop(0)

                        # Show a visual cue: green frame + "Recording..." text
                        h, w, _ = image.shape
                        cv2.rectangle(image, (10, 10), (w-10, h-10), (0, 255, 0), 6)
                        cv2.putText(image, "Recording...", (50, 60),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)

                        # Predict only once the window is full.
                        if len(sequence) == SEQ_LENGTH:
                            input_data = np.expand_dims(sequence, axis=0)  # (1,30,63)
                            prediction = model.predict(input_data, verbose=0)
                            class_id = np.argmax(prediction)
                            confidence = np.max(prediction)

                            if confidence > threshold:
                                label = actions[class_id]
                                cv2.putText(image, f'{label} {int(confidence * 100)}%',
                                            (10, 120), cv2.FONT_HERSHEY_SIMPLEX,
                                            2, (255, 0, 0), 3)

            cv2.imshow('ASL Recognition', image)
            # Press "q" to quit.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even if a frame
    # raised an exception mid-loop.
    cap.release()
    cv2.destroyAllWindows()