In [10]:
import cv2
import numpy as np
import librosa
import speech_recognition as sr
from transformers import pipeline
from deepface import DeepFace

# Load models
# Alias for the DeepFace module imported above; no weights are loaded by this
# assignment itself.
visual_emotion_model = DeepFace
# No model name/revision is passed, so transformers falls back to its default
# sentiment checkpoint (see the warning printed below this cell). Pin an
# explicit model and revision for reproducible results.
text_sentiment_model = pipeline("sentiment-analysis")  # Hugging Face transformers
# Not wired up yet: get_audio_emotion currently returns a fixed label and does
# not read this variable.
audio_emotion_model = None  # Placeholder for a trained audio emotion recognition model



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [1]:
# Video Processing
def process_video(video_path):
    """Sample roughly one frame per second from a video and report emotions.

    The audio track is extracted and pre-split into chunks, then the video is
    read frame by frame. Every ``frame_rate``-th frame is passed to
    ``get_visual_emotion`` and one result line is printed for it. The audio and
    text streams are still disabled, so they are always reported as ``None``.

    Args:
        video_path: Path to the input video file.

    Returns:
        None. Results are printed to stdout.
    """
    # Load video
    cap = cv2.VideoCapture(video_path)

    try:
        # Load audio
        audio_path = extract_audio(video_path)
        audio, sr_rate = librosa.load(audio_path, sr=16000)

        recognizer = sr.Recognizer()
        audio_chunks = split_audio(audio_path)  # Split audio into fixed-length chunks

        # CAP_PROP_FPS comes back as 0 when the file could not be opened or the
        # container carries no FPS metadata; clamp to 1 so the modulo below can
        # never divide by zero (every frame is processed in that case).
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_rate = int(fps) if fps and fps >= 1 else 1
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every nth frame based on frame rate
            if frame_count % frame_rate == 0:
                # Process Visual Stream
                visual_emotion = get_visual_emotion(frame)

                # Process Audio Stream (disabled until a real model is available)
                audio_emotion = None
                # if audio_chunks:
                #     audio_chunk = audio_chunks.pop(0)
                #     audio_emotion = get_audio_emotion(audio_chunk, sr_rate)

                # Process Text Stream (disabled; depends on audio_chunk above)
                text_emotion = None
                # if audio_chunks:
                #     text_emotion = get_text_emotion(audio_chunk, recognizer)

                # Print or aggregate the results
                print(f"Frame {frame_count}: Visual: {visual_emotion}, Audio: {audio_emotion}, Text: {text_emotion}")

            frame_count += 1
    finally:
        # Release the capture even if audio extraction or a per-frame step raises.
        cap.release()

import subprocess

def extract_audio(video_path):
    """Extract the audio track of a video into a mono WAV file using ffmpeg.

    Args:
        video_path: Path to the input video file.

    Returns:
        The path of the written audio file (always ``"temp_audio.wav"`` in the
        current working directory).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH
            (raised by ``subprocess.run``).
    """
    audio_path = "temp_audio.wav"
    command = [
        "ffmpeg",
        "-nostdin",  # never wait for interactive input
        "-y",        # overwrite temp_audio.wav left behind by a previous run
        "-i", video_path,
        "-ac", "1",
        audio_path,
    ]
    result = subprocess.run(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Without this check a failed extraction would go unnoticed and the caller
    # would load a stale temp_audio.wav from an earlier video (or none at all).
    if result.returncode != 0:
        details = result.stderr.decode(errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed to extract audio from {video_path!r} "
            f"(exit code {result.returncode}): {details}"
        )
    return audio_path



# Helper: Split audio into chunks
def split_audio(audio_path, chunk_duration=3):
    """Load an audio file at 16 kHz and cut it into consecutive chunks.

    Args:
        audio_path: Path to the audio file to load with ``librosa.load``.
        chunk_duration: Length of each chunk in seconds. Fractional values
            (e.g. ``0.5``) are accepted.

    Returns:
        A list of consecutive slices of the loaded signal. Every chunk holds
        ``chunk_duration`` seconds of samples except possibly the last, which
        holds whatever remains. An empty signal yields an empty list.

    Raises:
        ValueError: If ``chunk_duration`` is too small to cover one sample.
    """
    audio, sr_rate = librosa.load(audio_path, sr=16000)
    # range() only accepts integers, so a float duration must be converted
    # to a whole number of samples first.
    chunk_samples = int(round(chunk_duration * sr_rate))
    if chunk_samples < 1:
        raise ValueError(
            f"chunk_duration must cover at least one sample, got {chunk_duration!r}"
        )
    return [
        audio[start:start + chunk_samples]
        for start in range(0, len(audio), chunk_samples)
    ]


import cv2

from deepface import DeepFace

# Load face cascade classifier
# Frontal-face Haar cascade built from the XML file under cv2.data.haarcascades.
# Created once at module level; intended for the face-detection step of
# get_visual_emotion.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


# Helper: Get visual emotion
def get_visual_emotion(frame):
    """Detect faces in a frame, label their emotions and show the frame.

    Each face found by the Haar cascade is analysed with DeepFace, outlined
    and labelled directly on ``frame`` (the array is modified in place), and
    the annotated frame is shown in an OpenCV window.

    Args:
        frame: Image array as returned by ``cv2.VideoCapture.read`` (BGR).

    Returns:
        The dominant emotion of the largest detected face, or ``None`` when no
        face is found.
    """
    # Convert frame to grayscale
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Convert grayscale frame back to a 3-channel image for DeepFace
    rgb_frame = cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)

    # Detect faces in the frame
    faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    dominant_emotion = None
    largest_area = 0
    for (x, y, w, h) in faces:
        # Extract the face ROI (Region of Interest)
        face_roi = rgb_frame[y:y + h, x:x + w]

        # Perform emotion analysis on the face ROI
        result = DeepFace.analyze(face_roi, actions=['emotion'], enforce_detection=False)
        # Recent DeepFace releases return a list of per-face dicts, older ones
        # a single dict; accept both.
        if isinstance(result, list):
            result = result[0]
        emotion = result['dominant_emotion']

        # Draw rectangle around face and label with predicted emotion
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(frame, emotion, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

        # Report the most prominent (largest) face to the caller.
        area = int(w) * int(h)
        if area > largest_area:
            largest_area = area
            dominant_emotion = emotion

    # Display the resulting frame
    cv2.imshow('Real-time Emotion Detection', frame)
    # imshow only paints when the HighGUI event loop runs; without waitKey the
    # window stays blank and unresponsive.
    cv2.waitKey(1)

    return dominant_emotion
    

# Helper: Get audio emotion
def get_audio_emotion(audio_chunk, sr_rate):
    """Return an emotion label for one audio chunk (currently a stub).

    MFCC features are computed for the chunk, but no classifier consumes them
    yet, so a successful call always yields ``"Neutral"``.

    Args:
        audio_chunk: Audio samples passed to ``librosa.feature.mfcc`` as ``y``.
        sr_rate: Sampling rate of ``audio_chunk``.

    Returns:
        ``"Neutral"`` when feature extraction succeeds, otherwise a string of
        the form ``"Error: <message>"``.
    """
    try:
        # Features are discarded for now; computing them still surfaces
        # malformed input as an error string.
        librosa.feature.mfcc(y=audio_chunk, sr=sr_rate, n_mfcc=40)
    except Exception as exc:
        return f"Error: {exc}"
    # Placeholder: replace with a pre-trained audio emotion recognition model
    return "Neutral"

# Helper: Get text emotion
def get_text_emotion(audio_chunk, recognizer, sample_rate=16000):
    """Transcribe an audio chunk and classify the sentiment of the text.

    Args:
        audio_chunk: Either a NumPy array of samples (as produced by
            ``split_audio``) or a path / file object accepted by
            ``sr.AudioFile``.
        recognizer: A ``speech_recognition.Recognizer`` instance.
        sample_rate: Sampling rate of ``audio_chunk`` when it is an array.
            Defaults to 16000, the rate ``split_audio`` loads at.

    Returns:
        The label of the first result from ``text_sentiment_model``, or a
        string of the form ``"Error: <message>"`` if any step fails
        (including when no speech could be recognised).
    """
    try:
        if isinstance(audio_chunk, np.ndarray):
            # sr.AudioFile only accepts a path or file object, so in-memory
            # samples are wrapped as 16-bit little-endian PCM instead.
            # Assumes float samples in [-1, 1] (librosa's convention);
            # out-of-range values are clipped.
            pcm = (np.clip(audio_chunk, -1.0, 1.0) * 32767).astype("<i2")
            audio = sr.AudioData(pcm.tobytes(), sample_rate, 2)
        else:
            with sr.AudioFile(audio_chunk) as source:
                audio = recognizer.record(source)
        text = recognizer.recognize_google(audio)
        sentiment = text_sentiment_model(text)
        return sentiment[0]['label']
    except Exception as e:
        return f"Error: {e}"

# Main Function
if __name__ == "__main__":
    # Hard-coded input clip, resolved relative to the current working directory.
    video_path = "video.mp4"
    # Runs the whole pipeline synchronously and prints one line per sampled frame.
    process_video(video_path)


KeyboardInterrupt: 