In [17]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [10]:

# MediaPipe module aliases used by the detection and drawing helpers below.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Initialize a standalone Holistic model instance.
# NOTE(review): the capture loops in this notebook open their own Holistic
# context managers, so this instance looks unused here — confirm before removing.
holistic_model = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

def mediapipe_detection(image, model):
    """Run a MediaPipe-style model on a BGR frame.

    Args:
        image: Frame in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before being handed to the model).
        model: Object exposing ``process(rgb_image)``.

    Returns:
        ``(bgr_image, results)``: the frame converted back to BGR, and whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The RGB copy is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    detections = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detections

def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark groups on ``image`` with default styling.

    Each landmark group read from ``results`` is passed to
    ``mp_drawing.draw_landmarks`` together with the matching connection set
    taken from ``mp_holistic``. Nothing is returned.
    """
    # (attribute on `results`, attribute on `mp_holistic`)
    landmark_groups = (
        ("face_landmarks", "FACEMESH_CONTOURS"),
        ("pose_landmarks", "POSE_CONNECTIONS"),
        ("left_hand_landmarks", "HAND_CONNECTIONS"),
        ("right_hand_landmarks", "HAND_CONNECTIONS"),
    )
    for landmarks_attr, connections_attr in landmark_groups:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmarks_attr),
            getattr(mp_holistic, connections_attr),
        )

def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks on ``image`` with per-group styling.

    For each landmark group in ``results`` this calls
    ``mp_drawing.draw_landmarks`` with two ``mp_drawing.DrawingSpec`` objects:
    the first styles the landmark points, the second styles the connections.

    Args:
        image: Image that ``mp_drawing.draw_landmarks`` draws onto.
        results: Object exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # (results attribute, mp_holistic connection set, landmark style, connection style)
    styles = (
        # Face
        ("face_landmarks", "FACEMESH_CONTOURS",
         dict(color=(80, 110, 10), thickness=1, circle_radius=1),
         # Channel fixed to 255: the former value 256 is outside the 0-255
         # range of an 8-bit color channel.
         dict(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        ("pose_landmarks", "POSE_CONNECTIONS",
         dict(color=(80, 22, 10), thickness=2, circle_radius=4),
         dict(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        ("left_hand_landmarks", "HAND_CONNECTIONS",
         dict(color=(121, 22, 76), thickness=2, circle_radius=4),
         dict(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        ("right_hand_landmarks", "HAND_CONNECTIONS",
         dict(color=(245, 117, 66), thickness=2, circle_radius=4),
         dict(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmarks_attr, connections_attr, landmark_style, connection_style in styles:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, landmarks_attr),
            getattr(mp_holistic, connections_attr),
            mp_drawing.DrawingSpec(**landmark_style),
            mp_drawing.DrawingSpec(**connection_style),
        )
    

In [2]:
cap = cv2.VideoCapture(0)
try:
    # Set up the MediaPipe Holistic model for the duration of the preview.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered: stop instead of handing an invalid
                # frame to cv2.cvtColor inside mediapipe_detection.
                break

            image, results = mediapipe_detection(frame, holistic)

            draw_styled_landmarks(image, results)
            cv2.imshow('Dữ liệu từ OpenCV', image)
            # Press "q" to leave the preview.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame fails to process.
    cap.release()
    cv2.destroyAllWindows()


In [3]:
# Exploratory look at the landmark layout, using the `results` left over from
# the capture loop in the previous cell.

# One [x, y, z, visibility] row per pose landmark. Guarded because
# `results.pose_landmarks` may be empty/None (the expressions below already
# allow for that); iterating it unguarded raises AttributeError.
pose = []
if results.pose_landmarks:
    for res in results.pose_landmarks.landmark:
        test = np.array([res.x, res.y, res.z, res.visibility])
        pose.append(test)

# Flattened per-group vectors, with zero vectors of the same length as fallback:
# pose = 33 landmarks * 4 values, face = 468 * 3, each hand = 21 * 3.
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

def extract_keypoints(results):
    """Flatten all landmark groups in ``results`` into one 1-D feature vector.

    The vector is the concatenation, in this order, of:
    pose (33 landmarks x [x, y, z, visibility]), face (468 x [x, y, z]),
    left hand (21 x [x, y, z]) and right hand (21 x [x, y, z]).
    A group that is missing (falsy) contributes zeros of the same length.

    NOTE(review): a later cell defines ``extract_keypoints`` again with the same
    logic; that later definition is the one in effect after a full run.
    """
    def _flat(group, fields, n_points):
        # Missing group -> zero placeholder of the expected length.
        if not group:
            return np.zeros(n_points * len(fields))
        return np.array([[getattr(pt, f) for f in fields] for pt in group.landmark]).flatten()

    xyz = ("x", "y", "z")
    return np.concatenate([
        _flat(results.pose_landmarks, xyz + ("visibility",), 33),
        _flat(results.face_landmarks, xyz, 468),
        _flat(results.left_hand_landmarks, xyz, 21),
        _flat(results.right_hand_landmarks, xyz, 21),
    ])

In [4]:
def extract_keypoints(results):
    """Build the per-frame feature vector from a holistic detection result.

    Groups are emitted in a fixed order (pose, face, left hand, right hand) and
    concatenated into a single 1-D array. When a group is absent (falsy) it is
    replaced by a zero vector of the length that group would normally have.
    """
    xyz = ("x", "y", "z")
    # (attribute on `results`, landmark count, fields read from each landmark)
    layout = (
        ("pose_landmarks", 33, xyz + ("visibility",)),
        ("face_landmarks", 468, xyz),
        ("left_hand_landmarks", 21, xyz),
        ("right_hand_landmarks", 21, xyz),
    )

    chunks = []
    for attr, count, fields in layout:
        group = getattr(results, attr)
        if group:
            rows = [[getattr(lm, f) for f in fields] for lm in group.landmark]
            chunks.append(np.array(rows).flatten())
        else:
            # No detection for this group: keep the slot, filled with zeros.
            chunks.append(np.zeros(count * len(fields)))

    # Concatenate all groups into a single array.
    return np.concatenate(chunks)


In [7]:
import os
import numpy as np
import cv2
from PIL import Image, ImageDraw, ImageFont

# Path and dataset configuration
DATA_PATH = os.path.join('model1vnmn')  # root folder of the saved keypoint arrays

# Sign labels; the position in this array is the class index.
actions = np.array([
    'Xin chào',
    'Cô giáo',
    'Cảm ơn',
    'Đẹp gái',
])

# Sequences per action, and frames per sequence (as iterated by the loading loop).
no_sequences, sequence_length = 30, 30

# Render Unicode text onto an OpenCV image by going through Pillow.
def put_text_pillow(image, text, position, font_path="arial.ttf", font_size=32, color=(255, 255, 255)):
    """Draw ``text`` on a BGR image using a TrueType font and return the new image.

    Args:
        image: OpenCV image (numpy array) in BGR channel order.
        text: String to draw.
        position: ``(x, y)`` passed to Pillow's ``ImageDraw.text``.
        font_path: TrueType font file handed to ``ImageFont.truetype``.
        font_size: Font size handed to ``ImageFont.truetype``.
        color: Fill color. The text is drawn on the RGB-converted image, so
            Pillow interprets this tuple as RGB.

    Returns:
        A new BGR numpy array with the text drawn; ``image`` itself is not modified.
    """
    # OpenCV (BGR numpy array) -> Pillow image (RGB).
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Load the font and draw the text.
    font = ImageFont.truetype(font_path, font_size)
    ImageDraw.Draw(pil_image).text(position, text, font=font, fill=color)

    # Pillow image (RGB) -> OpenCV (BGR numpy array).
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

# Release the capture handle and close any OpenCV windows that are still open.
# NOTE(review): this relies on `cap` created in an earlier cell — confirm the
# cleanup is still needed in this cell.
cap.release()
cv2.destroyAllWindows()


In [None]:
def put_text_pillow_with_background(image, text, position, font_path="arial.ttf", font_size=32, 
                                    text_color=(255, 255, 255), bg_color=(0, 0, 0), padding=10):
    """Draw ``text`` on a filled background rectangle and return the new BGR image.

    Args:
        image: OpenCV image (numpy array) in BGR channel order.
        text: String to draw.
        position: ``(x, y)`` passed to Pillow's ``ImageDraw.text``.
        font_path: TrueType font file handed to ``ImageFont.truetype``.
        font_size: Font size handed to ``ImageFont.truetype``.
        text_color: Fill color of the text (drawn on the RGB-converted image).
        bg_color: Fill color of the background rectangle (same color space).
        padding: Pixels added on every side of the text box for the rectangle.

    Returns:
        A new BGR numpy array; ``image`` itself is not modified.
    """
    # OpenCV (BGR numpy array) -> Pillow image (RGB).
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    draw = ImageDraw.Draw(pil_image)
    font = ImageFont.truetype(font_path, font_size)

    # Measure the text. ImageDraw.textsize() was removed in Pillow 10;
    # textbbox() is its replacement and returns (left, top, right, bottom)
    # for text anchored at `position`.
    if hasattr(draw, "textbbox"):
        left, top, right, bottom = draw.textbbox(position, text, font=font)
    else:
        # Older Pillow releases without textbbox(): fall back to textsize().
        text_width, text_height = draw.textsize(text, font=font)
        left, top = position[0], position[1]
        right, bottom = left + text_width, top + text_height

    # Background rectangle: the text box grown by `padding` on each side.
    draw.rectangle(
        [left - padding, top - padding, right + padding, bottom + padding],
        fill=bg_color,
    )

    # Text goes on top of the background.
    draw.text(position, text, font=font, fill=text_color)

    # Pillow image (RGB) -> OpenCV (BGR numpy array).
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)


In [9]:
# Map each action label to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

# `sequences` collects one window (list of per-frame arrays) per recording;
# `labels` holds the class index of the matching window.
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        # Frames of one recording live in DATA_PATH/<action>/<sequence>/<frame>.npy
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(sequence_dir, f"{frame_num}.npy"))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])



# LSTM Model

In [13]:
# TensorBoard callback writing to ./Logs.
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

# Stacked LSTM classifier. Input is a window of 30 frames with 1662 values per
# frame (the length of the vector built by extract_keypoints); output is one
# softmax probability per entry in `actions`.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)






# Load Model Weights

In [14]:
# Load previously saved weights into the model built above.
# NOTE(review): the .h5 file presumably comes from training this exact
# architecture and label set — confirm it matches the current `actions`.
model.load_weights('model1vnmn.h5')

# Test in Real Time

In [18]:
from PIL import Image, ImageDraw, ImageFont

# One bar color per action for prob_viz (drawn with cv2.rectangle on the BGR frame).
# Four entries because `actions` holds four labels; with only three, indexing
# the color of the fourth class raised IndexError.
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245), (245, 16, 117)]

def prob_viz(res, actions, input_frame, colors, font_path="arial.ttf"):
    """Overlay one probability bar and label per class on a copy of the frame.

    Args:
        res: Sequence of class probabilities; ``res[i]`` belongs to ``actions[i]``.
        actions: Class labels, indexable by position.
        input_frame: BGR image (numpy array); it is copied, not modified.
        colors: Non-empty sequence of bar colors for ``cv2.rectangle``. If there
            are more classes than colors, the colors are reused cyclically.
        font_path: TrueType font used for the labels.

    Returns:
        A new BGR image with bars (width ``int(prob * 100)`` pixels) and
        ``"<label>: <prob>"`` text, one row every 40 pixels starting at y=60.

    Raises:
        ValueError: If ``colors`` is empty.
    """
    if len(colors) == 0:
        raise ValueError("colors must contain at least one color")

    output_frame = input_frame.copy()
    probs = list(res)
    if not probs:
        # Nothing to draw; avoid loading the font for an empty overlay.
        return output_frame

    # Bars are drawn with OpenCV directly on the BGR frame.
    for num, prob in enumerate(probs):
        # Cycle through the palette so having more classes than colors
        # does not raise IndexError.
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, 60 + num * 40), (int(prob * 100), 90 + num * 40), bar_color, -1)

    # Labels are drawn with Pillow so Unicode text renders correctly. The font
    # is loaded once and the frame makes a single BGR -> RGB -> BGR round trip.
    pil_image = Image.fromarray(cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)
    font = ImageFont.truetype(font_path, 32)  # adjust the font size here if needed
    for num, prob in enumerate(probs):
        draw.text((10, 60 + num * 40), f"{actions[num]}: {prob:.2f}", font=font, fill=(255, 255, 255))

    # Pillow (RGB) -> OpenCV (BGR).
    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)


In [19]:
# 1. New detection variables
sequence = []
sentence = []
threshold = 0.8

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
#         sequence.insert(0,keypoints)
#         sequence = sequence[:30]
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            
            
        #3. Viz logic
            if res[np.argmax(res)] > threshold: 
                if len(sentence) > 0: 
                    if actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])
                else:
                    sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [20]:
import cv2
import mediapipe as mp
import numpy as np
import pyttsx3

# Initialize the text-to-speech engine
engine = pyttsx3.init()
# Set the speaking rate
engine.setProperty('rate', 150)

# New detection variables
sequence = []     # rolling window of per-frame keypoint vectors
sentence = []     # most recent distinct predictions, shown in the banner
threshold = 0.8   # minimum class probability needed to accept a prediction

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read feed; stop when the camera delivers no frame instead of
            # passing an invalid frame on to cv2.cvtColor.
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # Prediction logic: keep only the latest 30 frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                predicted_action = actions[np.argmax(res)]
                print(predicted_action)

                # Accept a confident prediction only when it differs from the
                # last accepted word.
                is_confident = res[np.argmax(res)] > threshold
                if is_confident and (not sentence or predicted_action != sentence[-1]):
                    sentence.append(predicted_action)
                    # Keep at most the five latest words.
                    sentence = sentence[-5:]

                    # Speak only the newly accepted word. runAndWait() blocks
                    # until speech finishes, so speaking on every confident
                    # frame froze the video loop and repeated the same word.
                    engine.say(str(predicted_action))
                    engine.runAndWait()

                # Viz probabilities on every prediction, matching the
                # non-speaking loop in the previous cell.
                image = prob_viz(res, actions, image, colors)

            # Banner across the full frame width.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            # The labels contain Vietnamese characters, which cv2.putText's
            # built-in fonts cannot render; draw them through Pillow instead.
            image = put_text_pillow(image, ' '.join(sentence), (3, 2), font_size=28)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even on error.
    cap.release()
    cv2.destroyAllWindows()


Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Thầy Cô
Xin chào
Thầy Cô
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Thầy Cô
Xin chào
Thầy Cô
Thầy Cô
Xin chào
Xin chào
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Xin chào
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Thầy Cô
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Thầy Cô
Xin chào
Thầy Cô
Xin chào
Xin chào
Thầy Cô
Xin chào
Xin chào
Xin chào
Xin chào
Thầy Cô
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Xin chào
Thầy Cô
Xin chào
Thầy Cô
Thầy Cô
Xin chào
