**Postech IA para Devs - Fase 4**

Tech Challenge

Grupo 4:


*   Marcos Carielo - rm357969
*   Priscila Nitta - rm357392
*   Renato Mello - rm357879
*   Vitor Soares - rm356986



O PROBLEMA

A criação de uma aplicação que utilize análise de vídeo. O projeto deve incorporar as técnicas de reconhecimento facial, análise de expressões emocionais em vídeos e detecção de atividades.

A PROPOSTA DO DESAFIO

Criar uma aplicação a partir de um vídeo, e que execute as seguintes tarefas:
1. Reconhecimento facial: Identifique e marque os rostos presentes no vídeo.
2. Análise de expressões emocionais: Analise as expressões emocionais dos rostos identificados.
3. Detecção de atividades: Detecte e categorize as atividades sendo realizadas no vídeo.
4. Geração de resumo: Crie um resumo automático das principais atividades e emoções detectadas no vídeo.

In [1]:
!pip install face-recognition
!pip install deepface
!pip install mediapipe

Collecting face-recognition
  Downloading face_recognition-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting face-recognition-models>=0.3.0 (from face-recognition)
  Downloading face_recognition_models-0.3.0.tar.gz (100.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading face_recognition-1.3.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: face-recognition-models
  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone
  Created wheel for face-recognition-models: filename=face_recognition_models-0.3.0-py2.py3-none-any.whl size=100566162 sha256=2e96d892651379b09a26d9cddab9edab391d91cd67deda7aed21740a3f6b1bda
  Stored in directory: /root/.cache/pip/wheels/7a/eb/cf/e9eced74122b679557f597bb7c8e4c739cfcac526db1fd523d
Successfully built face-recognition-models
Installing collected packages: face-reco

In [2]:
import cv2
import numpy as np
import gc
from tqdm import tqdm
from deepface import DeepFace
import mediapipe as mp

25-02-16 22:38:21 - Directory /root/.deepface has been created
25-02-16 22:38:21 - Directory /root/.deepface/weights has been created


In [5]:
# Configuração do caminho do vídeo
VIDEO_PATH = '/content/video.mp4'
OUTPUT_VIDEO_PATH = '/content/video_result.mp4'
REPORT_PATH = '/content/relatorio_video.txt'

FRAME_SKIP = 1  # Analisa emoções a cada 1 frame. Esse valor pode ser alterado para melhorar o desempenho

# Inicializar o MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()
mp_drawing = mp.solutions.drawing_utils

def process_video_combined(video_path, output_path, report_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    total_frames = 0
    emotions_summary = {}
    actions_summary = {}

    previous_hip_x = None
    previous_wrist_positions = []

    def identify_action(landmarks):
        if not landmarks:
            return None

        left_wrist = landmarks[mp_pose.PoseLandmark.LEFT_WRIST.value]
        right_wrist = landmarks[mp_pose.PoseLandmark.RIGHT_WRIST.value]
        left_elbow = landmarks[mp_pose.PoseLandmark.LEFT_ELBOW.value]
        right_elbow = landmarks[mp_pose.PoseLandmark.RIGHT_ELBOW.value]
        left_hip = landmarks[mp_pose.PoseLandmark.LEFT_HIP.value]
        right_hip = landmarks[mp_pose.PoseLandmark.RIGHT_HIP.value]
        left_shoulder = landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER.value]
        right_shoulder = landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]

        nonlocal previous_hip_x, previous_wrist_positions

        hip_x_avg = (left_hip.x + right_hip.x) / 2
        arm_distance = abs(left_wrist.x - left_shoulder.x) + abs(right_wrist.x - right_shoulder.x)

        if previous_hip_x is not None and abs(hip_x_avg - previous_hip_x) > 0.03 and arm_distance < 0.2:
            previous_hip_x = hip_x_avg
            return "Andando"

        previous_hip_x = hip_x_avg

        both_arms_moving_up = left_wrist.y < left_shoulder.y and right_wrist.y < right_shoulder.y
        wrists_positions = [(left_wrist.x, left_wrist.y), (right_wrist.x, right_wrist.y)]

        if both_arms_moving_up:
            previous_wrist_positions.append(wrists_positions)
            if len(previous_wrist_positions) > 10:
                previous_wrist_positions.pop(0)

            wrist_movement = sum(
                np.linalg.norm(np.array(previous_wrist_positions[i]) - np.array(previous_wrist_positions[i - 1]))
                for i in range(1, len(previous_wrist_positions))
            )

            if wrist_movement > 1.5:
                return "Dançando"

        if abs(left_wrist.y - left_elbow.y) < 0.1 and abs(right_wrist.y - right_elbow.y) < 0.1:
            return "Digitando no computador"

        if left_wrist.y < left_elbow.y or right_wrist.y < right_elbow.y:
            return "Dando tchau"

        if abs(left_wrist.x - right_wrist.x) < 0.05 and abs(left_wrist.y - right_wrist.y) < 0.05:
            return "Aperto de mão"

        return "Movimento anômalo"

    for i in tqdm(range(frame_count), desc='Processing Video'):
        ret, frame = cap.read()
        if not ret:
            break

        total_frames += 1
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        emotion_text = "Emotion: Unknown"
        action_label = "Nenhuma ação detectada"

        if total_frames % FRAME_SKIP == 0:
            try:
                analysis = DeepFace.analyze(rgb_frame, actions=['emotion'], enforce_detection=False)
                for face in analysis:
                    x, y, w, h = face['region']['x'], face['region']['y'], face['region']['w'], face['region']['h']
                    emotion = face['dominant_emotion']
                    emotions_summary[emotion] = emotions_summary.get(emotion, 0) + 1
                    emotion_text = f'Emotion: {emotion}'
                    cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                    cv2.putText(frame, emotion_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)
            except Exception as e:
                print(f"Erro ao analisar emoções: {e}")

        results = pose.process(rgb_frame)
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
            action_label = identify_action(results.pose_landmarks.landmark)
            actions_summary[action_label] = actions_summary.get(action_label, 0) + 1

        cv2.putText(frame, action_label, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
        out.write(frame)

        if total_frames % 20 == 0:
            gc.collect()

    cap.release()
    out.release()

    # Criar relatório em TXT
    with open(report_path, "w", encoding="utf-8") as file:
        file.write(f"Relatório de Análise do Vídeo\n")
        file.write(f"------------------------------------\n")
        file.write(f"Total de Frames Analisados: {total_frames}\n\n")

        file.write(f"Distribuição das Emoções:\n")
        for emotion, count in emotions_summary.items():
            file.write(f"- {emotion}: {count} ocorrências\n")

        file.write(f"\nDistribuição das Ações Detectadas:\n")
        for action, count in actions_summary.items():
            file.write(f"- {action}: {count} ocorrências\n")

    print(f'Total frames: {total_frames}\nEmotion distribution: {emotions_summary}\nAction distribution: {actions_summary}')

# Chamar a função para processar o vídeo
process_video_combined(VIDEO_PATH, OUTPUT_VIDEO_PATH, REPORT_PATH)

Processing Video: 100%|██████████| 3326/3326 [15:51<00:00,  3.50it/s]

Total frames: 3326
Emotion distribution: {'happy': 961, 'sad': 877, 'angry': 126, 'fear': 516, 'neutral': 896, 'surprise': 148, 'disgust': 4}
Action distribution: {'Aperto de mão': 3, 'Movimento anômalo': 1032, 'Digitando no computador': 1017, 'Dando tchau': 675, 'Dançando': 46, 'Andando': 36}



