In [6]:
import cv2
import mediapipe as mp

# Khởi tạo MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Hàm để vẽ keypoints
def draw_keypoints(image, keypoints, keypoint_ids):
    tmp_lst = []
    for idx, keypoint in enumerate(keypoints):
        if idx in keypoint_ids:
            x = int(keypoint.x * image.shape[1])
            y = int(keypoint.y * image.shape[0])
            cv2.circle(image, (x, y), 5, (0, 255, 0), -1)
            tmp_lst.append([x, y])

    return tmp_lst

# Mở video
cap = cv2.VideoCapture(r'crop_video\017_002_003_cropped.mp4')
lst = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Thay đổi kích thước frame xuống một nửa
    frame = cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2))

    # Xử lý frame
    results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    if results.pose_landmarks:
        # Hiển thị các keypoints bạn chọn (ví dụ: 0, 1, 2)
        tmp_lst = draw_keypoints(frame, results.pose_landmarks.landmark, [15, 17, 19, 21, 16, 18, 20, 22, 0, 2, 5])
        lst.append(tmp_lst)

    cv2.imshow('Keypoints in Video', frame)
    if cv2.waitKey(1) & 0xFF == 27:  # Nhấn ESC để thoát
        break

cap.release()
cv2.destroyAllWindows()

In [5]:
print(lst)

[[[239, 177], [261, 156], [218, 157], [377, 550], [188, 428], [387, 592], [224, 432], [368, 581], [230, 411], [363, 560], [219, 406]], [[240, 175], [261, 155], [218, 156], [377, 551], [185, 427], [387, 594], [221, 431], [368, 588], [228, 410], [363, 567], [217, 406]], [[240, 174], [261, 154], [218, 155], [377, 551], [183, 427], [386, 595], [219, 431], [368, 591], [226, 410], [362, 570], [216, 406]], [[240, 173], [261, 154], [218, 154], [377, 552], [182, 427], [385, 599], [218, 430], [367, 594], [225, 409], [361, 572], [215, 406]], [[240, 173], [261, 154], [218, 154], [377, 554], [182, 427], [385, 602], [217, 430], [367, 596], [225, 409], [361, 574], [214, 406]], [[240, 173], [261, 154], [218, 154], [377, 556], [181, 427], [384, 605], [217, 430], [367, 599], [225, 409], [361, 578], [214, 406]], [[240, 173], [261, 154], [218, 154], [377, 556], [181, 427], [384, 605], [217, 429], [367, 598], [225, 409], [361, 577], [214, 405]], [[240, 173], [261, 154], [218, 154], [376, 556], [181, 427], 

In [1]:
import cv2
import mediapipe as mp
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Khởi tạo MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Hàm để vẽ và lấy keypoints
def draw_keypoints(image, keypoints, keypoint_ids):
    tmp_lst = []
    for idx, keypoint in enumerate(keypoints):
        if idx in keypoint_ids:
            x = int(keypoint.x * image.shape[1])
            y = int(keypoint.y * image.shape[0])
            tmp_lst.append([x, y])
    return tmp_lst

# Hàm để xử lý và lưu keypoints của một video vào CSV
def process_and_save_video(video_path, extracted_data_folder, keypoint_ids):
    cap = cv2.VideoCapture(video_path)
    all_keypoints = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if results.pose_landmarks:
            keypoints = draw_keypoints(frame, results.pose_landmarks.landmark, keypoint_ids)
            all_keypoints.append(keypoints)

    cap.release()
    # Lưu keypoints vào CSV
    video_name = os.path.basename(video_path).split('.')[0]
    csv_path = os.path.join(extracted_data_folder, video_name + '.csv')
    pd.DataFrame(all_keypoints).to_csv(csv_path, index=False)

# Đường dẫn đến folder chứa video
video_folder_path = 'reframe_video'

# Thư mục để lưu trữ file CSV
extracted_data_folder = 'extracted_data'
if not os.path.exists(extracted_data_folder):
    os.makedirs(extracted_data_folder)

# ID của keypoints mà bạn quan tâm
keypoint_ids = [15, 17, 19, 21, 16, 18, 20, 22, 0, 2, 5]

# Tạo danh sách đường dẫn video
video_paths = [os.path.join(video_folder_path, video_file) for video_file in os.listdir(video_folder_path) if video_file.endswith('.mp4')]

# Xử lý các video và lưu vào CSV sử dụng đa luồng
with ThreadPoolExecutor(max_workers=1) as executor:
    list(tqdm(executor.map(lambda p: process_and_save_video(p, extracted_data_folder, keypoint_ids), video_paths), total=len(video_paths)))


100%|██████████| 2982/2982 [2:18:51<00:00,  2.79s/it]  


In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Helper function to convert string keypoints to float lists
def convert_keypoints(kp_str):
    kp_str = kp_str.strip('[]')
    return np.array(kp_str.split(',')).astype(float)

# Tải và chuẩn bị dữ liệu
data_folder = 'extracted_data'
X, y_raw = [], []
label_mapping = {}
for csv_file in tqdm(os.listdir(data_folder)):
    try:
        if csv_file.endswith('.csv'):
            label = int(csv_file.split('_')[0])  # Lấy label từ tên file
            if label not in label_mapping:
                label_mapping[label] = len(label_mapping)  # Map label to a unique index
            mapped_label = label_mapping[label]
            df = pd.read_csv(os.path.join(data_folder, csv_file), converters={i: convert_keypoints for i in range(11)})
            keypoints = np.array(df.values.tolist())  # Convert DataFrame rows to list of keypoints
            X.append(keypoints)
            y_raw.append(mapped_label)
    except Exception as e:
        print(f"Error processing {csv_file}: {e}")

# Padding các chuỗi dữ liệu
max_length = max(len(sequence) for sequence in X)
X_pad = pad_sequences(X, maxlen=max_length, padding='post', dtype='float32')

# Chia dữ liệu thành tập huấn luyện và kiểm thử
X_train, X_test, y_train_raw, y_test_raw = train_test_split(X_pad, y_raw, test_size=0.2, random_state=42)

# Get the number of classes based on the unique labels found
num_classes = len(np.unique(y_raw))

# Convert the raw labels to categorical
y_train_cat = to_categorical(y_train_raw, num_classes=num_classes)
y_test_cat = to_categorical(y_test_raw, num_classes=num_classes)

# Reshape X_train and X_test to have the shape (num_samples, timesteps, num_features)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], -1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], -1))

100%|██████████| 2982/2982 [00:10<00:00, 276.35it/s]


In [3]:
print(max_length)

120


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
import numpy as np

# Make sure you have executed the data preprocessing steps before running this code
# X_train, X_test, y_train_cat, and y_test_cat should already be defined

# Determine the number of classes for the output layer
num_classes = y_train_cat.shape[1]

# Xây dựng mô hình LSTM
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation = 'tanh', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.25))
model.add(LSTM(128, return_sequences=True, activation='tanh'))
model.add(Dropout(0.25))
model.add(LSTM(64, return_sequences=False, activation='tanh'))
model.add(Dropout(0.25))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # Use the determined number of classes here

# Define the Adam optimizer with the initial learning rate
adam_optimizer = Adam(learning_rate=0.00001)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Tạo callback EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=10)

# Tạo callback để lưu mô hình với val_accuracy tốt nhất
checkpoint_path = "model/model_checkpoint_1.h5"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Huấn luyện mô hình với callback
history = model.fit(X_train, y_train_cat,batch_size=32, epochs=100, validation_data=(X_test, y_test_cat), callbacks=[checkpoint, early_stop], verbose=1)

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.64154, saving model to model\model_checkpoint_1.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.64154 to 0.71524, saving model to model\model_checkpoint_1.h5
Epoch 3/100
Epoch 3: val_accuracy improved from 0.71524 to 0.80737, saving model to model\model_checkpoint_1.h5
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.80737
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.80737
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.80737
Epoch 7/100
Epoch 7: val_accuracy improved from 0.80737 to 0.84757, saving model to model\model_checkpoint_1.h5
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.84757
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.84757
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.84757
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.84757
Epoch 12/100
Epoch 12: val_accuracy did not improve from 0.84757
Epoch 13/100
Epoch 13: val_accuracy did not i

In [15]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_15 (LSTM)              (None, 120, 64)           22272     
                                                                 
 dropout_15 (Dropout)        (None, 120, 64)           0         
                                                                 
 lstm_16 (LSTM)              (None, 120, 128)          98816     
                                                                 
 dropout_16 (Dropout)        (None, 120, 128)          0         
                                                                 
 lstm_17 (LSTM)              (None, 64)                49408     
                                                                 
 dropout_17 (Dropout)        (None, 64)                0         
                                                                 
 dense_15 (Dense)            (None, 64)               

In [22]:
# Đánh giá mô hình
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 83.75%


In [23]:
import os

def predict_action(csv_file_path, model, label_mapping):
    df = pd.read_csv(csv_file_path, converters={i: convert_keypoints for i in range(11)})
    keypoints = np.array(df.values.tolist())
    keypoints_pad = pad_sequences([keypoints], maxlen=max_length, padding='post', dtype='float32').reshape((1, max_length, -1))
    prediction = model.predict(keypoints_pad)
    action_class = np.argmax(prediction)
    # Reverse map to original label
    original_label = {v: k for k, v in label_mapping.items()}[action_class]
    return original_label

# Dự đoán hành động từ một file CSV mới
directory = r"test_csv" 

for filename in os.listdir(directory):
    try:
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            predicted_action = predict_action(file_path, model, label_mapping)
            print(f"Hành động được dự đoán là: {predicted_action}\nHành động thực tế là: {filename.split('_')[0]}")
    except:
        print(f"cannot read file {filename}")

Hành động được dự đoán là: 17
Hành động thực tế là: 017
Hành động được dự đoán là: 21
Hành động thực tế là: 021
cannot read file 021_006_005_cropped_shrinking_linearly.csv
Hành động được dự đoán là: 26
Hành động thực tế là: 026
Hành động được dự đoán là: 33
Hành động thực tế là: 033
Hành động được dự đoán là: 50
Hành động thực tế là: 050
Hành động được dự đoán là: 64
Hành động thực tế là: 064
Hành động được dự đoán là: 64
Hành động thực tế là: 064


In [10]:
import cv2
import mediapipe as mp
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Khởi tạo MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

# Hàm để trích xuất và lấy keypoints cần thiết
def extract_keypoints(results, keypoint_ids):
    keypoints = []
    for idx in keypoint_ids:
        keypoint = results.pose_landmarks.landmark[idx]
        keypoints.extend([keypoint.x, keypoint.y, keypoint.z, keypoint.visibility])
    return keypoints

# Xử lý video từ webcam và trích xuất keypoints
def process_webcam_stream(model, keypoint_ids, max_length):
    cap = cv2.VideoCapture(r"test_video\real_test_1.mp4")
    sequence = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (960, 1080))
        # Xử lý frame để trích xuất keypoints
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)

        if results.pose_landmarks:
            keypoints = extract_keypoints(results, keypoint_ids)
            sequence.append(keypoints)

            # Dự đoán hành động nếu đủ số lượng keypoints
            if len(sequence) == max_length:
                action = predict_action(sequence, model, max_length)
                print(f"Predicted Action: {action}")
                sequence = []  # Reset sequence sau mỗi dự đoán

        cv2.imshow('Webcam Feed', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):  # Nhấn 'q' để thoát
            break

    cap.release()
    cv2.destroyAllWindows()

# Hàm để dự đoán hành động từ chuỗi keypoints
def predict_action(sequence, model, max_length):
    sequence_pad = pad_sequences([sequence], maxlen=max_length, padding='post', dtype='float32')
    prediction = model.predict(sequence_pad)
    action_class = np.argmax(prediction)
    original_label = {v: k for k, v in label_mapping.items()}[action_class]
    return original_label

# ID của keypoints mà bạn quan tâm
keypoint_ids = [15, 17, 19, 21, 16, 18, 20, 22, 0, 2, 5]

max_length = 120

# Gọi hàm xử lý video từ webcam
process_webcam_stream(model, keypoint_ids, max_length)


Predicted Action: 64
Predicted Action: 44
Predicted Action: 33
Predicted Action: 64
Predicted Action: 44
Predicted Action: 51
Predicted Action: 26


In [12]:
{v: k for k, v in label_mapping.items()}

{0: 17, 1: 21, 2: 26, 3: 33, 4: 39, 5: 44, 6: 50, 7: 51, 8: 56, 9: 64}

In [None]:
import cv2
import mediapipe as mp
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hàm để dự đoán hành động từ chuỗi keypoints
def predict_action(sequence, model, max_length):
    sequence_pad = pad_sequences([sequence], maxlen=max_length, padding='post', dtype='float32')
    prediction = model.predict(sequence_pad)
    action_class = np.argmax(prediction)
    original_label = {v: k for k, v in label_mapping.items()}[action_class]
    return original_label

# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.5

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()