In [1]:
import cv2
import face_recognition
import mediapipe as mp
from moviepy import VideoFileClip, CompositeAudioClip
from tqdm import tqdm
from deepface import DeepFace




In [2]:
def is_arm_up(landmarks, mp_pose):
    """Report whether at least one arm is raised above eye level.

    An arm counts as raised when its elbow has a smaller ``y`` value than
    the eye on the same side of the body.

    Args:
        landmarks: Sequence of pose landmarks, indexable by the integer
            ``.value`` of ``mp_pose.PoseLandmark`` members; each entry must
            expose a ``y`` attribute.
        mp_pose: Object providing the ``PoseLandmark`` enumeration with
            ``LEFT_EYE``, ``RIGHT_EYE``, ``LEFT_ELBOW`` and ``RIGHT_ELBOW``.

    Returns:
        True if the left or the right elbow is above its eye, else False.
    """
    joints = mp_pose.PoseLandmark

    def y_of(joint):
        # Vertical coordinate of a single landmark.
        return landmarks[joint.value].y

    left_eye_y = y_of(joints.LEFT_EYE)
    right_eye_y = y_of(joints.RIGHT_EYE)
    left_elbow_y = y_of(joints.LEFT_ELBOW)
    right_elbow_y = y_of(joints.RIGHT_ELBOW)

    left_raised = left_elbow_y < left_eye_y
    right_raised = right_elbow_y < right_eye_y
    return left_raised or right_raised

In [3]:
def detect_posture(landmarks, mp_pose, threshold=0.2):
    """Classify a pose as "Sitting" or "Standing" from hip/knee/ankle heights.

    This is a rough heuristic: the pose is called "Sitting" when both the
    hip-to-knee and the knee-to-ankle vertical gaps are smaller than
    ``threshold``; otherwise it is "Standing".

    Args:
        landmarks: Sequence of pose landmarks, indexable by the integer
            ``.value`` of ``mp_pose.PoseLandmark`` members; each entry must
            expose a ``y`` attribute.
        mp_pose: Object providing the ``PoseLandmark`` enumeration.
        threshold: Maximum vertical gap (in the same units as ``y``) for a
            leg segment to count as "compressed". Defaults to 0.2, the value
            that was previously hard-coded.

    Returns:
        "Sitting" or "Standing", or False when a required landmark is
        missing or invalid.
    """
    joints = mp_pose.PoseLandmark
    required = (
        joints.LEFT_HIP,
        joints.RIGHT_HIP,
        joints.LEFT_KNEE,
        joints.RIGHT_KNEE,
        joints.LEFT_ANKLE,
        joints.RIGHT_ANKLE,
    )

    try:
        # Look landmarks up by index. The previous membership test
        # (`landmark not in landmarks`) compared an enum member against the
        # landmark objects themselves, so it always reported "missing".
        points = [landmarks[joint.value] for joint in required]
        if any(point is None for point in points):
            return False

        left_hip, right_hip, left_knee, right_knee, left_ankle, right_ankle = points
        hip_y = (left_hip.y + right_hip.y) / 2
        knee_y = (left_knee.y + right_knee.y) / 2
        ankle_y = (left_ankle.y + right_ankle.y) / 2
    except (AttributeError, IndexError, KeyError, TypeError):
        # Return False if landmarks are missing or invalid
        return False

    # Compare absolute vertical gaps. With image-style coordinates (y grows
    # toward the bottom of the frame) the signed differences hip - knee and
    # knee - ankle are negative for an upright body, which made the old
    # `< 0.2` test pass unconditionally.
    thigh_gap = abs(knee_y - hip_y)
    shin_gap = abs(ankle_y - knee_y)

    if thigh_gap < threshold and shin_gap < threshold:
        return "Sitting"
    return "Standing"

In [4]:
def detect_poses_faces_and_emotions(video_path, output_path, report_path):
    """Annotate a video with pose, arm-raise, posture and emotion information.

    Every frame of ``video_path`` is analysed with DeepFace (emotion) and
    MediaPipe Pose. Pose landmarks, a running arm-raise counter, face boxes
    and dominant emotions are drawn onto the frame, which is then written to
    ``output_path``. Events are appended as text lines to ``report_path``:

    * an arm-raise each time ``is_arm_up`` switches from False to True,
    * the posture returned by ``detect_posture`` on every frame where it is
      available,
    * the dominant emotion whenever it differs from the last one logged.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated video to create (written with the
            'mp4v' codec, without audio).
        report_path: Path of the text report. It is opened in append mode,
            so existing content is kept.

    Returns:
        None. Prints a message and returns early if the input video cannot
        be opened or the output video cannot be created.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("There was an error opening the video.")
        return

    mp_pose = mp.solutions.pose
    mp_drawing = mp.solutions.drawing_utils

    # Video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
    # duration of the output and desynchronises audio that is added later.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps or fps <= 0:
        # Fallback for sources that do not report a frame rate.
        fps = 30.0
    # Only used to size the progress bar; the reported count may be inexact.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Codec for MP4 output
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # State carried across frames
    arm_up = False              # was an arm raised on the previous frame?
    arm_movements_count = 0     # number of lowered -> raised transitions
    emotion = ''                # last emotion written to the report
    i = 1                       # 1-based frame number used in the report

    try:
        if not out.isOpened():
            print("There was an error creating the output video.")
            return

        with mp_pose.Pose() as pose, \
                open(report_path, 'a') as report, \
                tqdm(total=total_frames if total_frames > 0 else None,
                     desc="Processing") as progress:
            # Read until the capture is exhausted instead of trusting the
            # reported frame count.
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Analyse the untouched frame for faces and emotions
                face_results = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                if isinstance(face_results, dict):
                    # Some DeepFace versions return a single dict rather
                    # than a list of per-face dicts.
                    face_results = [face_results]

                # MediaPipe expects RGB input; OpenCV delivers BGR
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pose_results = pose.process(rgb_frame)

                if pose_results.pose_landmarks:
                    landmarks = pose_results.pose_landmarks.landmark

                    # Draw pose landmarks on the frame
                    mp_drawing.draw_landmarks(frame, pose_results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

                    # Count only the transition from "down" to "up"
                    if is_arm_up(landmarks, mp_pose):
                        if not arm_up:
                            arm_up = True
                            arm_movements_count += 1
                            report.write(f"on frame {i}: Arm Movement Up Detected\n")
                    else:
                        arm_up = False

                    # Show arm movements count on the frame
                    cv2.putText(frame, f'Arm movements: {arm_movements_count}', (10, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)

                    # Log the posture when it could be determined
                    posture = detect_posture(landmarks, mp_pose)
                    if posture in ("Sitting", "Standing"):
                        report.write(f"on frame {i}: {posture} person detected\n")

                # Annotate every analysed face
                for face in face_results:
                    region = face['region']
                    x, y, w, h = region['x'], region['y'], region['w'], region['h']
                    dominant_emotion = face['dominant_emotion']

                    # Box around the face with the emotion label above it
                    cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
                    cv2.putText(frame, dominant_emotion, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

                    # Report the emotion only when it changes
                    if emotion != dominant_emotion:
                        emotion = dominant_emotion
                        report.write(f"on frame {i}: Face Detected with Dominant Emotion: {emotion}\n")

                # Write the processed frame to the output video
                out.write(frame)

                i += 1
                progress.update(1)
    finally:
        # Free resources even if a frame fails to process
        cap.release()
        out.release()

In [5]:
def add_audio_to_video(input_video, processed_video, output_video):
    """Copy the audio track of one video onto another and save the result.

    Args:
        input_video: Path of the original video whose audio is reused.
        processed_video: Path of the (silent) processed video.
        output_video: Path of the file to write, encoded with libx264 video
            and AAC audio.

    Returns:
        None. If ``input_video`` has no audio track an error is printed and
        no output file is written.
    """
    # Both clips are used as context managers so they are closed on every
    # exit path, including the early return and any write error.
    with VideoFileClip(input_video) as original_video:
        # Check if the original video has audio
        if original_video.audio is None:
            print("Error: Original video has no audio.")
            return

        # Load processed video (no audio)
        with VideoFileClip(processed_video) as processed_video_clip:
            # Combine processed video with original audio
            processed_video_clip.audio = CompositeAudioClip([original_video.audio])

            # Write the output video with audio
            processed_video_clip.write_videofile(output_video, codec="libx264", audio_codec="aac")

In [6]:
# Annotate the source video and append detected events to the text report.
detect_poses_faces_and_emotions(
    video_path='video.mp4',
    output_path='output_video_without_sound.mp4',
    report_path='report.txt',
)

Processing: 100%|██████████| 3326/3326 [1:18:43<00:00,  1.42s/it]


In [7]:
# Put the original soundtrack back onto the annotated (silent) video.
add_audio_to_video(
    input_video="video.mp4",
    processed_video="output_video_without_sound.mp4",
    output_video="final_video_with_audio.mp4",
)

MoviePy - Building video final_video_with_audio.mp4.
MoviePy - Writing audio in final_video_with_audioTEMP_MPY_wvf_snd.mp4


                                                                      

MoviePy - Done.
MoviePy - Writing video final_video_with_audio.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready final_video_with_audio.mp4
