In [None]:
# Cell 1: Install required libraries into the notebook kernel's environment (%pip is an IPython magic; from a terminal, run the same command with plain `pip install` instead)
%pip install moviepy librosa transformers torch torchvision mediapipe opencv-python matplotlib

In [None]:
# Cell 2: Import necessary libraries
import os
import numpy as np
import librosa
from moviepy import VideoFileClip
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import mediapipe as mp
import cv2
import matplotlib.pyplot as plt

In [None]:
# Cell 3: Define function to extract audio from video
def extract_audio(video_path, output_audio_path="temp_audio.wav"):
    """Write the audio track of a video to a 16-bit PCM audio file.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Destination path for the extracted audio.

    Returns:
        ``output_audio_path``, unchanged, so the call can be chained.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # A clip without an audio track exposes ``audio`` as None; fail with a
        # clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(output_audio_path, codec='pcm_s16le')
    finally:
        # Release the underlying reader processes/file handles even on failure.
        video.close()
    return output_audio_path

In [None]:
# Cell 4: Define function to process audio using Wav2Vec2
def process_audio(audio_path):
    """Turn an audio file into a single Wav2Vec2 embedding vector.

    The audio is loaded at 16 kHz, passed through the pretrained
    ``facebook/wav2vec2-base-960h`` model, and the model's last hidden state
    is averaged over dim 1 (presumably the time axis - confirm against the
    model documentation).

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        NumPy array holding the mean-pooled, squeezed last hidden state.
    """
    checkpoint = "facebook/wav2vec2-base-960h"
    feature_extractor = Wav2Vec2Processor.from_pretrained(checkpoint)
    encoder = Wav2Vec2Model.from_pretrained(checkpoint)

    # Resample on load so the signal matches the rate handed to the processor.
    samples, rate = librosa.load(audio_path, sr=16000)
    model_inputs = feature_extractor(samples, sampling_rate=rate, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        encoded = encoder(**model_inputs)

    pooled = encoded.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

In [None]:
# Cell 5: Define function to extract visual features (lip movements)
def extract_visual_features(video_path):
    """Average face-mesh landmark coordinates over the frames of a video.

    Every frame is run through MediaPipe FaceMesh. For each frame in which a
    face is found, the (x, y, z) coordinates of landmarks 0-19 of the first
    face are flattened into one vector; the vectors are then averaged.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        Tuple ``(visual_embeddings, face_detection_rate)`` where
        ``visual_embeddings`` is the mean landmark vector and
        ``face_detection_rate`` is the fraction of frames with a detected face.

    Raises:
        ValueError: If no face is detected in any frame.
    """
    mp_face_mesh = mp.solutions.face_mesh
    face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1)
    lip_landmarks = []
    total_frames = 0
    try:
        video = VideoFileClip(video_path)
        try:
            # Stream frames instead of materialising the whole video in memory.
            for frame in video.iter_frames():
                total_frames += 1
                # moviepy already yields RGB frames, which is the channel order
                # FaceMesh expects; converting with COLOR_BGR2RGB would swap
                # the channels to BGR.
                results = face_mesh.process(np.ascontiguousarray(frame))
                if not results.multi_face_landmarks:
                    continue
                landmarks = results.multi_face_landmarks[0].landmark
                # NOTE(review): indices 0-19 are simply the first 20 mesh
                # landmarks; MediaPipe describes the lip contour via
                # mp.solutions.face_mesh.FACEMESH_LIPS - confirm whether those
                # indices were intended. Kept as-is to preserve the output shape.
                lip_coords = [
                    (landmarks[i].x, landmarks[i].y, landmarks[i].z)
                    for i in range(0, 20)
                ]
                lip_landmarks.append(np.array(lip_coords).flatten())
        finally:
            video.close()
    finally:
        face_mesh.close()

    if not lip_landmarks:
        raise ValueError("No face detected in the video.")

    visual_embeddings = np.mean(lip_landmarks, axis=0)
    # One vector is appended per successful detection, so the list length is
    # the number of frames with a face.
    face_detection_rate = len(lip_landmarks) / total_frames
    return visual_embeddings, face_detection_rate


In [None]:
# Cell 6: Define function to compute mismatch metrics
def compute_mismatch_metrics(audio_embeddings, visual_embeddings):
    """Compare an audio and a visual embedding vector.

    Both inputs are flattened, the shorter one is zero-padded to the length of
    the longer one, and both are scaled to unit length before comparison.

    Args:
        audio_embeddings: Array-like audio embedding.
        visual_embeddings: Array-like visual embedding.

    Returns:
        Dict with keys ``"cosine_similarity"``, ``"mismatch_score"``
        (``1 - cosine_similarity``) and ``"euclidean_distance"`` (distance
        between the two unit vectors).

    Raises:
        ValueError: If either embedding has a zero or non-finite norm, since
            it cannot be normalized.
    """
    audio_vec = np.asarray(audio_embeddings, dtype=np.float64).ravel()
    visual_vec = np.asarray(visual_embeddings, dtype=np.float64).ravel()

    # Zero-pad whichever vector is shorter; padding only the visual vector
    # makes np.pad fail with a negative width when it is the longer one.
    target_len = max(audio_vec.shape[0], visual_vec.shape[0])
    audio_vec = np.pad(audio_vec, (0, target_len - audio_vec.shape[0]), mode='constant')
    visual_vec = np.pad(visual_vec, (0, target_len - visual_vec.shape[0]), mode='constant')

    audio_norm = np.linalg.norm(audio_vec)
    visual_norm = np.linalg.norm(visual_vec)
    for label, norm in (("audio", audio_norm), ("visual", visual_norm)):
        # Dividing by a zero/NaN norm would silently propagate NaN metrics.
        if not np.isfinite(norm) or norm == 0:
            raise ValueError(f"Cannot normalize {label} embeddings: norm is {norm}.")

    audio_unit = audio_vec / audio_norm
    visual_unit = visual_vec / visual_norm

    # Clip to the mathematical range to absorb floating-point rounding.
    cosine_similarity = np.clip(np.dot(audio_unit, visual_unit), -1.0, 1.0)
    mismatch_score = 1 - cosine_similarity

    euclidean_distance = np.linalg.norm(audio_unit - visual_unit)

    return {
        "cosine_similarity": cosine_similarity,
        "mismatch_score": mismatch_score,
        "euclidean_distance": euclidean_distance
    }


In [None]:
# Cell 7: Main function to analyze video
def analyze_video(video_path):
    """Run the full audio/visual mismatch analysis for one video.

    The audio track is extracted to a temporary WAV file, embedded, compared
    against the visual landmark embedding, and the temporary file is removed
    afterwards.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        Tuple ``(metrics, face_detection_rate)`` as produced by
        ``compute_mismatch_metrics`` and ``extract_visual_features``.
    """
    import tempfile

    # Use a unique temp file so an unrelated "temp_audio.wav" in the working
    # directory is never overwritten or deleted, and parallel runs don't clash.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Step 1: Extract audio from video
        extract_audio(video_path, audio_path)

        # Step 2: Process audio to get embeddings
        audio_embeddings = process_audio(audio_path)

        # Step 3: Extract visual features (lip movements)
        visual_embeddings, face_detection_rate = extract_visual_features(video_path)

        # Step 4: Compute mismatch metrics
        metrics = compute_mismatch_metrics(audio_embeddings, visual_embeddings)
    finally:
        # Clean up the temporary file even when one of the steps raises.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return metrics, face_detection_rate

In [None]:
# Cell 8: Run analysis on a sample video and classify the mismatch score (results are saved as JSON in the next cell)
if __name__ == "__main__":
    import json

    # Replace with your video file path
    video_path = "000471.mp4"
    metrics, face_detection_rate = analyze_video(video_path)

    # Translate the mismatch score into a verdict: below 0.5 is aligned,
    # up to and including 1 is a moderate mismatch, anything else is high.
    analysis_result = (
        "Audio and visual content are well-aligned."
        if metrics['mismatch_score'] < 0.5
        else "Moderate mismatch detected. Check for minor inconsistencies."
        if metrics['mismatch_score'] <= 1
        else "High mismatch detected. Audio and visual content are inconsistent."
    )

In [None]:
import os
import json
# Location of the JSON file that accumulates analysis results
results_file_path = 'results.json'

# Assemble this run's metrics, rounded to two decimals
# (the face detection rate is stored as a percentage)
results_data = dict(
    face_detection_rate=round(face_detection_rate * 100, 2),
    cosine_similarity=round(float(metrics['cosine_similarity']), 2),
    mismatch_score=round(float(metrics['mismatch_score']), 2),
    euclidean_distance=round(float(metrics['euclidean_distance']), 2),
    analysis_result=analysis_result,
)

try:
    # Merge into earlier results only when a non-empty file is already present
    if os.path.exists(results_file_path) and os.path.getsize(results_file_path) > 0:
        with open(results_file_path, 'r') as file:
            existing_data = json.load(file)
        # Keys from this run overwrite stored ones; other stored keys are kept
        existing_data.update(results_data)
        results_data = existing_data

    # Persist the (possibly merged) results
    with open(results_file_path, 'w') as file:
        json.dump(results_data, file, indent=4)

    print(f"Results saved to {results_file_path}: {results_data}")

except Exception as e:
    # Best-effort save: report the problem instead of aborting the notebook
    print(f"Error handling results file: {str(e)}")