In [4]:
# Cell 1: Install required libraries into the kernel's environment (%pip is an IPython magic; from a terminal run the same command with plain `pip install` instead)
%pip install moviepy librosa transformers torch torchvision mediapipe opencv-python scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [1]:
# Cell 2: Import necessary libraries
import os
import numpy as np
import librosa
from moviepy import VideoFileClip
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import mediapipe as mp
import cv2
from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'librosa'

In [None]:

# Cell 3: Define function to extract audio from video
def extract_audio(video_path, output_audio_path="temp_audio.wav"):
    """
    Extracts the audio track of a video and writes it to an audio file.

    The audio is encoded with the ``pcm_s16le`` codec (signed 16-bit PCM).

    :param video_path: Path to the video file.
    :param output_audio_path: Destination path for the extracted audio.
    :return: The path the audio was written to (``output_audio_path``).
    :raises ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio stream exposes ``audio`` as None; fail with a
        # clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(output_audio_path, codec='pcm_s16le')
    finally:
        # Always release the clip's underlying reader resources, even on failure.
        video.close()
    return output_audio_path

In [None]:
# Cell 4: Define function to process audio using Wav2Vec2
def process_audio(audio_path):
    """
    Encodes an audio file into a single embedding vector with Wav2Vec2.

    The audio is loaded at 16 kHz, passed through the pretrained
    ``facebook/wav2vec2-base-960h`` processor and model, and the model's last
    hidden state is averaged over dimension 1 to give one pooled vector.

    :param audio_path: Path to the audio file to encode.
    :return: NumPy array holding the pooled embedding (squeezed; presumably
             shape (768,) for this checkpoint - TODO confirm).
    """
    checkpoint = "facebook/wav2vec2-base-960h"
    feature_processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    encoder = Wav2Vec2Model.from_pretrained(checkpoint)

    # Resample on load so the rate matches what is reported to the processor.
    samples, rate = librosa.load(audio_path, sr=16000)
    model_inputs = feature_processor(samples, sampling_rate=rate, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = encoder(**model_inputs).last_hidden_state
        pooled = hidden_states.mean(dim=1)

    return pooled.squeeze().numpy()

In [None]:
# Cell 5: Define function to extract visual features (lip movements)
# Landmark indices of the outer lip contour in the MediaPipe Face Mesh topology
# (the vertices of the outer loop of FACEMESH_LIPS). There are 20 of them, so
# the flattened (x, y, z) embedding keeps its 60-dimensional shape.
_OUTER_LIP_LANDMARKS = (
    61, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    291, 185, 40, 39, 37, 0, 267, 269, 270, 409,
)


def extract_visual_features(video_path):
    """
    Extracts visual features (lip landmark positions) from the video using MediaPipe Face Mesh.

    Frames in which no face is detected are skipped.

    :param video_path: Path to the video file.
    :return: Visual embeddings: the per-frame flattened (x, y, z) coordinates of
             the 20 outer-lip landmarks, averaged over all frames with a
             detected face (shape: (60,)).
    :raises ValueError: If no face is detected in any frame.
    """
    mp_face_mesh = mp.solutions.face_mesh
    lip_landmarks = []

    video = VideoFileClip(video_path)
    try:
        with mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1) as face_mesh:
            # Stream frames one at a time instead of materialising the whole
            # video in memory.
            for frame in video.iter_frames():
                # MoviePy already yields RGB frames, which is what MediaPipe
                # expects; a BGR->RGB conversion here would swap the channels.
                results = face_mesh.process(np.ascontiguousarray(frame))
                if not results.multi_face_landmarks:
                    continue

                landmarks = results.multi_face_landmarks[0].landmark
                lip_coords = [
                    (landmarks[i].x, landmarks[i].y, landmarks[i].z)
                    for i in _OUTER_LIP_LANDMARKS
                ]
                lip_landmarks.append(np.array(lip_coords).flatten())
    finally:
        # Release the clip's reader even if processing fails part-way.
        video.close()

    if not lip_landmarks:
        raise ValueError("No face detected in the video.")

    # Mean of lip landmark coordinates across all frames with a detected face.
    return np.mean(lip_landmarks, axis=0)

# Cell 6: Define function to compute mismatch score
def compute_mismatch_score(audio_embeddings, visual_embeddings):
    """
    Computes the mismatch score between audio and visual embeddings.

    The score is ``1 - cosine similarity``. When the two vectors differ in
    length, the shorter one is zero-padded at the end to the length of the
    longer one, whichever of the two that is.

    NOTE(review): the audio and visual embeddings come from unrelated feature
    extractors, so the score is presumably only a rough heuristic - confirm.

    :param audio_embeddings: Audio embedding vector (e.g. shape: (768,)).
    :param visual_embeddings: Visual embedding vector (e.g. shape: (60,)).
    :return: Mismatch score in [0, 2]; a higher score indicates higher mismatch.
    :raises ValueError: If either embedding is empty or has zero norm, because
                        cosine similarity is undefined in that case.
    """
    audio_vec = np.asarray(audio_embeddings, dtype=float).ravel()
    visual_vec = np.asarray(visual_embeddings, dtype=float).ravel()

    # Pad the shorter vector; padding in only one direction would hand np.pad
    # a negative width when the visual vector is the longer one.
    target_len = max(audio_vec.size, visual_vec.size)
    audio_vec = np.pad(audio_vec, (0, target_len - audio_vec.size), mode='constant')
    visual_vec = np.pad(visual_vec, (0, target_len - visual_vec.size), mode='constant')

    audio_norm = np.linalg.norm(audio_vec)
    visual_norm = np.linalg.norm(visual_vec)
    if audio_norm == 0 or visual_norm == 0:
        raise ValueError("Cannot compute a mismatch score for an empty or all-zero embedding.")

    # Compute cosine similarity
    similarity = np.dot(audio_vec, visual_vec) / (audio_norm * visual_norm)
    mismatch_score = 1 - similarity  # Higher score indicates higher mismatch

    return mismatch_score

In [None]:

# Cell 6: Define function to compute mismatch score
def compute_mismatch_score(audio_embeddings, visual_embeddings):
    """
    Computes the mismatch score (1 - cosine similarity) between two embeddings.

    The audio and visual embeddings produced in this notebook have different
    lengths, so the shorter vector is zero-padded at the end to the length of
    the longer one before the dot product; an unpadded ``np.dot`` would raise
    a shape error.

    :param audio_embeddings: Audio embedding vector.
    :param visual_embeddings: Visual embedding vector.
    :return: Mismatch score in [0, 2]; a higher score indicates higher mismatch.
    :raises ValueError: If either embedding is empty or has zero norm, because
                        cosine similarity is undefined in that case.
    """
    audio_vec = np.asarray(audio_embeddings, dtype=float).ravel()
    visual_vec = np.asarray(visual_embeddings, dtype=float).ravel()

    # Bring both vectors to a common length by zero-padding the shorter one.
    common_len = max(audio_vec.size, visual_vec.size)
    audio_vec = np.pad(audio_vec, (0, common_len - audio_vec.size), mode='constant')
    visual_vec = np.pad(visual_vec, (0, common_len - visual_vec.size), mode='constant')

    norm_product = np.linalg.norm(audio_vec) * np.linalg.norm(visual_vec)
    if norm_product == 0:
        raise ValueError("Cannot compute a mismatch score for an empty or all-zero embedding.")

    similarity = np.dot(audio_vec, visual_vec) / norm_product
    mismatch_score = 1 - similarity
    return mismatch_score

In [None]:

# Cell 7: Main function to analyze video
def analyze_video(video_path):
    """
    Runs the full audio/visual mismatch analysis on one video.

    Extracts the audio track to a temporary file, embeds the audio, extracts
    the visual (lip) features and compares the two embeddings.

    :param video_path: Path to the video file.
    :return: Mismatch score as returned by ``compute_mismatch_score``.
    """
    audio_path = extract_audio(video_path)
    try:
        audio_embeddings = process_audio(audio_path)
        visual_embeddings = extract_visual_features(video_path)
        return compute_mismatch_score(audio_embeddings, visual_embeddings)
    finally:
        # Remove the temporary audio file even when a later stage raises
        # (e.g. no face detected), so failed runs do not leave files behind.
        if os.path.exists(audio_path):
            os.remove(audio_path)

In [None]:
# Cell 8: Run analysis on a sample video
if __name__ == "__main__":
    video_path = "000471.mp4"  # Replace with your video file path
    mismatch_score = analyze_video(video_path)
    print(f"Mismatch Score: {mismatch_score:.4f}")

    # Three verdict bands: below 0.5, from 0.5 up to and including 1, and
    # everything else.
    print(
        "Result: Audio and visual content are well-aligned."
        if mismatch_score < 0.5
        else "Result: Moderate mismatch detected. Check for minor inconsistencies."
        if mismatch_score <= 1
        else "Result: High mismatch detected. Audio and visual content are inconsistent."
    )