In [17]:
def mediapipe_detection(image, model):
    """Run `model.process` on a BGR frame and return the frame with the results.

    Args:
        image: Frame to analyse. It is converted with cv2.COLOR_BGR2RGB before
            being handed to the model, so it is expected to be in BGR order.
        model: Object exposing a `process(image)` method (the notebook passes
            a MediaPipe Holistic instance).

    Returns:
        A `(image, results)` tuple: the frame converted back to BGR, and
        whatever `model.process` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is flagged read-only while the model runs and re-enabled
    # afterwards (presumably so the model can use it without copying — confirm
    # against the MediaPipe documentation).
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results



In [20]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


# Short alias for the MediaPipe holistic solution module; its `Holistic` class
# is opened as a context manager in the frame-processing loop below.
mp_holistic = mp.solutions.holistic
# Short alias for MediaPipe's drawing helpers.
# NOTE(review): not referenced anywhere in the visible cells — confirm it is still needed.
mp_drawing = mp.solutions.drawing_utils


def augment_frame(frame):
    """Produce three augmented variants of a single 3-channel frame.

    Args:
        frame: Image array whose `shape` unpacks into (rows, cols, channels).
            The noise array is built with 3 channels, so a 3-channel uint8
            image is assumed — TODO confirm for other inputs.

    Returns:
        A list `[flipped, rotated, noisy]`:
          * flipped: `cv2.flip(frame, 1)` (horizontal flip),
          * rotated: the frame rotated by 15 degrees about its centre at
            scale 1, rendered at the original width and height,
          * noisy: the frame plus random integer noise drawn from [0, 50).

    Note:
        The noise comes from the global NumPy random state, so the output is
        not reproducible unless that state is seeded elsewhere.
    """
    height, width, _ = frame.shape

    flipped = cv2.flip(frame, 1)

    rotation = cv2.getRotationMatrix2D((width/2, height/2), 15, 1)
    rotated = cv2.warpAffine(frame, rotation, (width, height))

    noise = np.random.randint(0, 50, (height, width, 3), dtype='uint8')
    noisy = cv2.add(frame, noise)

    return [flipped, rotated, noisy]



def extract_keypoints(results):
    """Flatten the landmarks of a holistic result into one 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * pose:       (x, y, z, visibility) per landmark, zero block of 33*4,
      * face:       (x, y, z) per landmark, zero block of 468*3,
      * left hand:  (x, y, z) per landmark, zero block of 21*3,
      * right hand: (x, y, z) per landmark, zero block of 21*3.

    A zero block of the stated size is used whenever the corresponding
    `*_landmarks` attribute of `results` is falsy, so the layout stays fixed
    when a body part is not detected.

    Args:
        results: Object with `pose_landmarks`, `face_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks` attributes, each
            either falsy or exposing a `.landmark` iterable.

    Returns:
        1-D numpy array; 1662 values when every present part has the landmark
        count listed above.
    """
    def _flatten(group, fields, count):
        # Missing detection -> zeros with the same length a detection would have.
        if not group:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    parts = [
        _flatten(results.pose_landmarks, xyz + ('visibility',), 33),
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)



# Read every frame of the source video into memory.
video_path = 'Video/QIPEC.mp4'
cap = cv2.VideoCapture(video_path)
# Fail fast: if the file cannot be opened the loop below would silently leave
# `frames` empty and the notebook would only fail later with an unrelated
# IndexError while padding the sequence.
if not cap.isOpened():
    cap.release()
    raise IOError(f"Could not open video file: {video_path}")
frames = []
try:
    while cap.isOpened():
        ret, frame = cap.read()
        # `ret` is falsy once no further frame can be read.
        if not ret:
            break
        frames.append(frame)
finally:
    # Release the capture even if reading raises part-way through.
    cap.release()



# Flatten the per-frame augmentation lists into one list, keeping frame order.
# NOTE(review): the variants of each frame end up next to each other (all
# variants of frame 0, then all variants of frame 1, ...), so consecutive
# entries are NOT consecutive video frames — confirm this ordering is intended
# before treating the result as a temporal sequence.
augmented_frames = [variant for frame in frames for variant in augment_frame(frame)]


# Process frames with mediapipe and build a fixed-length keypoint sequence.
sequence_length = 50
sequences = []
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # Only the first `sequence_length` keypoint vectors are kept, so frames
    # beyond that point are not run through the model at all.
    for frame in augmented_frames[:sequence_length]:
        image, results = mediapipe_detection(frame, holistic)
        keypoints = extract_keypoints(results)
        sequences.append(keypoints)

# With no frames there is no feature width to pad to; report that explicitly
# instead of failing on `sequences[0]`.
if not sequences:
    raise ValueError("No frames were processed; check that the video was read successfully.")

# Always hand an ndarray of shape (sequence_length, features) to later code,
# whether or not padding was needed.
sequences = np.array(sequences)
if len(sequences) < sequence_length:
    # Pad with all-zero keypoint rows up to `sequence_length`.
    padding = np.zeros((sequence_length - len(sequences), sequences.shape[1]))
    sequences = np.vstack((sequences, padding))



# Directory that receives one .npy file per frame of the sequence.
data_path = 'test'
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(data_path, exist_ok=True)

# Save each frame's keypoint vector as <data_path>/<frame index>.npy
for i, keypoints in enumerate(sequences):
    np.save(os.path.join(data_path, f'{i}.npy'), keypoints)

# Convert to a numpy array shaped (num_sequences, sequence_length, features),
# using the configured length and the actual feature width rather than
# repeating the literals 50 and 1662.
sequences = np.array(sequences)
sequences = sequences.reshape(-1, sequence_length, sequences.shape[-1])

I0000 00:00:1722499130.692760   33472 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1722499130.693887   43152 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.1), renderer: Mesa Intel(R) UHD Graphics (TGL GT1)
W0000 00:00:1722499130.744728   43140 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1722499130.757059   43146 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1722499130.757637   43145 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1722499130.757704   43144 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W00

In [21]:
# Sanity check: load the keypoint vector saved for index 0 and display it.
np.load("test/0.npy")

array([ 0.50906348,  0.2875011 , -0.60989499, ...,  0.        ,
        0.        ,  0.        ])

In [4]:
# Build a gTTS object for `text` with language code 'vi'.
# NOTE(review): neither `gTTS` nor `text` is defined in the visible cells —
# presumably an import and a `text` assignment ran in cells not shown; confirm,
# otherwise this cell fails on a fresh kernel.
tts = gTTS(text=text, lang='vi')


In [6]:
# Save the gTTS result to output.mp3 (path is relative to the working directory).
tts.save("output.mp3")

In [7]:
# Play the output.mp3 file written by the previous cell.
# NOTE(review): this import lives outside the notebook's main import cell.
from playsound import playsound
playsound("output.mp3")

playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.
