In [43]:
import os
import cv2
from deepface import DeepFace
from moviepy.editor import VideoFileClip
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tqdm
import librosa
import librosa.display

In [55]:
# Load the face/emotion detection table (one row per sampled `second`);
# the first CSV column is used as the DataFrame index.
df = pd.read_csv("df_face_times_trailer10.csv", index_col=0)
df

Unnamed: 0,second,type,face_confidence,emotion,emotion_confidence,face_match_prior
1,0.1,no faces,,,,
2,0.2,no faces,,,,
3,0.3,no faces,,,,
4,0.4,no faces,,,,
5,0.5,no faces,,,,
...,...,...,...,...,...,...
1536,153.6,no faces,,,,
1537,153.7,no faces,,,,
1538,153.8,no faces,,,,
1539,153.9,no faces,,,,


In [56]:
# Tally how many rows fall into each face-detection category.
df["type"].value_counts()

type
no faces          676
multiple faces    515
single face       349
Name: count, dtype: int64

In [57]:
# Summary statistics of both confidence scores, restricted to single-face rows.
df.loc[df["type"] == "single face", ["face_confidence", "emotion_confidence"]].describe()

Unnamed: 0,face_confidence,emotion_confidence
count,349.0,349.0
mean,0.997077,0.781977
std,0.010777,0.188649
min,0.92,0.34
25%,1.0,0.63
50%,1.0,0.82
75%,1.0,0.96
max,1.0,1.0


In [58]:
# Distribution of the detected emotion across single-face rows.
df.loc[df["type"] == "single face", "emotion"].value_counts()

emotion
neutral     106
angry        72
sad          69
happy        56
fear         33
surprise     13
Name: count, dtype: int64

In [59]:
# Keep rows that (a) show exactly one face, (b) have an emotion confidence
# above 0.6, and (c) occur after the first 60 seconds.
is_single_face = df["type"] == "single face"
is_confident = df["emotion_confidence"] > 0.6
is_after_first_minute = df["second"] > 60
df_filter = df[is_single_face & is_confident & is_after_first_minute]

# Fraction of all rows that survive the three filters.
len(df_filter) / len(df)

0.10649350649350649

In [60]:
# Display the rows that passed the filters defined in the previous cell.
df_filter

Unnamed: 0,second,type,face_confidence,emotion,emotion_confidence,face_match_prior
612,61.2,single face,1.00,neutral,0.82,False
667,66.7,single face,1.00,happy,0.95,False
671,67.1,single face,1.00,happy,0.74,False
673,67.3,single face,1.00,neutral,1.00,False
674,67.4,single face,1.00,angry,0.77,True
...,...,...,...,...,...,...
1352,135.2,single face,0.99,fear,0.92,False
1354,135.4,single face,0.98,happy,0.71,False
1356,135.6,single face,0.98,fear,0.96,False
1358,135.8,single face,0.94,sad,0.94,False


In [None]:
def cut_subclips(row, video_clip, output_dir):
    """Export the window preceding a detection as a silent video and an audio file.

    The window starts at ``row['second'] - 60`` and ends at
    ``row['second'] - 0.1`` (both rounded to one decimal). Two files are
    written into ``output_dir``:

    * ``{start}_{end}_{emotion}_clip_images.mp4`` - the frames without audio,
      written at 10 fps;
    * ``{start}_{end}_{emotion}_clip_audio.mp3`` - the audio track only.

    Args:
        row: Record providing the keys ``'second'`` and ``'emotion'``.
        video_clip: Source clip; must expose ``subclip`` and an ``audio``
            attribute that itself exposes ``subclip``.
        output_dir: Destination directory. It is created if it does not exist.

    Raises:
        ValueError: If the window would start before the beginning of the
            video (i.e. ``row['second']`` is below 60).
    """
    start_time = round(row['second'] - 60, 1)
    end_time = round(row['second'] - 0.1, 1)
    emotion = row['emotion']

    # MoviePy treats negative times as offsets from the END of the clip, so a
    # negative start would silently export the wrong window under a
    # misleading file name. Fail loudly instead.
    if start_time < 0:
        raise ValueError(
            f"Cannot cut a 60 s window before second {row['second']}: "
            f"start time {start_time} is negative."
        )

    # The writers do not create missing directories themselves.
    os.makedirs(output_dir, exist_ok=True)
    stem = f"{start_time}_{end_time}_{emotion}_clip"

    # Write the images file
    # NOTE: the subclips are intentionally not closed here; they share the
    # underlying reader with `video_clip`, which the caller owns and closes.
    images_clip = video_clip.subclip(start_time, end_time).without_audio()
    images_clip.write_videofile(os.path.join(output_dir, f"{stem}_images.mp4"), fps=10)

    # Write the audio file
    audio_clip = video_clip.audio.subclip(start_time, end_time)
    audio_clip.write_audiofile(os.path.join(output_dir, f"{stem}_audio.mp3"))

# Create several subclips: one video/audio pair per row of `df_filter`.
# NOTE(review): this cell writes to "barbie_subclips", whereas the Mel
# spectrogram cell further down reads from "trailer_subclips" - confirm which
# directory is intended so the notebook survives Restart & Run All.
output_dir = "barbie_subclips"
os.makedirs(output_dir, exist_ok=True)  # the writers need an existing directory

video_clip = VideoFileClip("barbie10smallest.mp4")
try:
    results = df_filter.apply(lambda row: cut_subclips(row, video_clip, output_dir), axis=1)
finally:
    # Release the underlying readers even if one of the exports fails.
    video_clip.close()

In [61]:
def audio2mel(row, output_dir):
    """Render the Mel spectrogram of an exported audio subclip to a PNG file.

    The audio file is located by rebuilding the name
    ``{start}_{end}_{emotion}_clip_audio.mp3`` from ``row`` (start is
    ``row['second'] - 60``, end is ``row['second'] - 0.1``, both rounded to
    one decimal). The spectrogram image is saved next to it as
    ``{start}_{end}_{emotion}_clip_melaudio.png``.

    Args:
        row: Record providing the keys ``'second'`` and ``'emotion'``.
        output_dir: Directory that contains the audio file and receives the PNG.
    """
    start_time = round(row['second'] - 60, 1)
    end_time = round(row['second'] - 0.1, 1)
    emotion = row['emotion']
    stem = f"{start_time}_{end_time}_{emotion}_clip"

    # Load audio file: y is the audio time series, sr is the sampling rate
    # (sr=None asks librosa to keep the file's own sampling rate).
    audio_path = os.path.join(output_dir, f"{stem}_audio.mp3")
    y, sr = librosa.load(audio_path, sr=None)
    # Compute Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    # Convert to decibels, relative to the loudest bin
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Plot on an explicit figure/axes pair and always close the figure, so a
    # failure while drawing or saving does not leave it open in pyplot.
    fig, ax = plt.subplots(figsize=(30, 12))
    try:
        librosa.display.specshow(S_dB, sr=sr, ax=ax)
        output_path = os.path.join(output_dir, f"{stem}_melaudio.png")
        fig.savefig(output_path)
    finally:
        plt.close(fig)

# Render one Mel spectrogram image per row of `df_filter`.
output_dir = "trailer_subclips"
results = df_filter.apply(audio2mel, axis=1, args=(output_dir,))
