# Syncnet DEMO

https://github.com/voletiv/syncnet-in-keras/

1. Given a video (*.mp4), convert it to the input format expected by the Syncnet lip & audio models
2. Load the Syncnet lip and audio models
3. Calculate lip-video and audio embeddings using Syncnet
4. Calculate the Euclidean distance between the lip and audio embeddings to check whether the video and audio are in sync

In [None]:
import cv2
import dlib
import numpy as np
import os
import scipy.io.wavfile as wav
import speechpy

import syncnet_params

from syncnet_functions import load_pretrained_syncnet_model

# DETECT MOUTH IN FRAME USING process_lrw_functions
# FROM https://github.com/voletiv/lipreading-in-the-wild-experiments/tree/master/process-lrw
import sys
sys.path.append('../lipreading-in-the-wild-experiments/process-lrw')
from process_lrw_functions import detect_mouth_in_frame

# FUNCTIONS

## get_syncnet_lip_model_input

In [None]:
def get_syncnet_lip_model_input(video, shape_predictor_path="shape_predictor_68_face_landmarks.dat"):
    """Build the Syncnet lip-model input from a video file.

    Frames are read in consecutive, non-overlapping groups of 5. For every
    frame, the mouth crop returned by ``detect_mouth_in_frame`` is converted
    to grayscale, resized to the Syncnet mouth size and shifted by -110. The
    5 crops of a group are then stacked along the last axis.

    Args:
        video: Path of the video file, opened with ``cv2.VideoCapture``.
        shape_predictor_path: Path of the dlib shape-predictor model file.

    Returns:
        ``np.ndarray`` with one entry per complete group of 5 frames, each
        entry of shape ``(MOUTH_H, MOUTH_W, 5)``. A trailing incomplete
        group is dropped; the array is empty when fewer than 5 frames could
        be read.
    """
    frames_per_clip = 5

    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor(shape_predictor_path)

    cap = cv2.VideoCapture(video)
    try:
        frameFPS    = int(cap.get(cv2.CAP_PROP_FPS))
        frameCount  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frameWidth  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print("FPS: {}".format(frameFPS))
        print("Frames: {}".format(frameCount))
        print("Width: {}".format(frameWidth))
        print("Height: {}".format(frameHeight))

        # Default face rect
        # NOTE(review): the second value returned by detect_mouth_in_frame is
        # discarded below, so prevFace is always this default rectangle --
        # confirm whether the detected face was meant to be fed back.
        face = dlib.rectangle(30, 30, 220, 220)

        lip_model_input = []

        # Read frames from the video, one group of 5 at a time
        while cap.isOpened():

            frames = []
            for _ in range(frames_per_clip):

                _, frame = cap.read()

                # If no frame is read, the video is exhausted
                if frame is None:
                    break

                # Detect mouth in the frame
                mouth, _ = detect_mouth_in_frame(frame, detector, predictor,
                                                 prevFace=face, verbose=False)

                # Convert mouth to grayscale
                mouth = cv2.cvtColor(mouth, cv2.COLOR_BGR2GRAY)

                # Resize mouth to syncnet input shape; cv2.resize expects
                # dsize as (width, height)
                mouth = cv2.resize(mouth, (syncnet_params.MOUTH_W, syncnet_params.MOUTH_H))

                # Subtract 110 from all mouth values (Checked in syncnet_demo.m)
                mouth = mouth - 110.

                frames.append(mouth)

            # An incomplete group means the video ended: drop it and stop
            if len(frames) != frames_per_clip:
                break

            # syncnet requires (112,112,5)
            lip_model_input.append(np.stack(frames, axis=-1))
    finally:
        # Always free the capture handle, even if mouth detection raises
        cap.release()

    return np.array(lip_model_input)

## get_syncnet_audio_model_input

In [None]:
def extract_syncnet_mfcc(wav_file, verbose=False):
    """Extract MFCC clips from a .wav file in the layout the audio model takes.

    MFCC features are computed with a 10 ms frame length and 10 ms stride,
    the first coefficient of every frame is dropped, and the frames are
    grouped into consecutive clips of ``syncnet_params.AUDIO_TIME_STEPS``
    frames (20 frames = 0.2 s per clip, acc. to the syncnet paper). Trailing
    frames that do not fill a whole clip are discarded.

    Args:
        wav_file: Path of the .wav file to read.
        verbose: When True, print intermediate lengths and shapes.

    Returns:
        ``np.ndarray`` of shape ``(N, n_coeffs, AUDIO_TIME_STEPS, 1)`` where
        ``N = len(mfcc_features) // AUDIO_TIME_STEPS`` and ``n_coeffs`` is the
        number of MFCC coefficients left after dropping the first one
        (12 per the original notes). ``N`` is 0 when the audio is shorter
        than one clip.

    Raises:
        ValueError: If speechpy raises an IndexError while extracting MFCCs.
    """

    rate, sig = wav.read(wav_file)
    if verbose:
        print("Sig length: {}, sample_rate: {}".format(len(sig), rate))

    try:
        mfcc_features = speechpy.feature.mfcc(sig, sampling_frequency=rate, frame_length=0.010, frame_stride=0.010)
    except IndexError as err:
        raise ValueError("ERROR: Index error occurred while extracting mfcc") from err

    if verbose:
        print("mfcc_features shape:", mfcc_features.shape)

    time_steps = syncnet_params.AUDIO_TIME_STEPS

    # Number of audio clips = len(mfcc_features) // length of each audio clip
    number_of_audio_clips = len(mfcc_features) // time_steps

    if verbose:
        print("Number of audio clips:", number_of_audio_clips)

    # Don't consider the first MFCC feature, only consider the next 12 (Checked in syncnet_demo.m)
    # Also, only consider time_steps*number_of_audio_clips features
    mfcc_features = mfcc_features[:time_steps * number_of_audio_clips, 1:]

    # Reshape mfcc_features from (x, 12) to (x//20, 12, 20, 1).
    # An explicit reshape (rather than np.split) also works for zero clips,
    # where np.split would fail on a division by zero.
    clips = np.reshape(mfcc_features, (number_of_audio_clips, time_steps, mfcc_features.shape[1]))
    mfcc_features = np.expand_dims(np.transpose(clips, (0, 2, 1)), axis=-1)

    if verbose:
        print("Final mfcc_features shape:", mfcc_features.shape)

    return mfcc_features


def get_syncnet_audio_model_input(video):
    """Extract the audio track of ``video`` and convert it to audio-model input.

    ffmpeg writes the audio as mono, 16 kHz, 16-bit PCM to ``<video>.wav``
    (overwriting any existing file of that name); the .wav file is then
    passed to ``extract_syncnet_mfcc``.

    Args:
        video: Path of the video file.

    Returns:
        Whatever ``extract_syncnet_mfcc`` returns for the extracted .wav file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Function-scope import so this definition does not depend on other cells
    import subprocess

    # Convert video's audio to .wav file
    audio_out = "{}.wav".format(video)
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to ffmpeg verbatim.
    command = ["ffmpeg", "-y", "-loglevel", "panic", "-i", video,
               "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000", audio_out]
    # check=True: fail here rather than reading a missing or stale .wav below
    subprocess.run(command, check=True)

    # Extract proper input to syncnet_audio_model
    return extract_syncnet_mfcc(audio_out)

# 1. Given a video, convert to proper inputs to the Syncnet lip & audio models

**Make sure video is of 25fps!**
If not, use the following ffmpeg command to convert fps:

```
ffmpeg -i <video>.mp4 -r 25 -y <video_at_25_fps>.mp4
```

In [None]:
def convert_video_to_25_fps(video):
    """Re-encode ``video`` at 25 fps with ffmpeg, replacing the file in place.

    The converted video is first written to a temporary .mp4 next to the
    original and only moved over it once ffmpeg has succeeded, so a failed
    conversion leaves the original file untouched.

    Args:
        video: Path of the video file to convert.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Function-scope import so this definition does not depend on other cells
    import subprocess

    # Temporary output beside the source file, so os.replace stays on one
    # filesystem; the .mp4 suffix lets ffmpeg pick the container as before.
    root, _ = os.path.splitext(video)
    tmp_path = root + ".tmp_25fps.mp4"

    try:
        # Argument list instead of a shell string: no quoting problems
        subprocess.run(["ffmpeg", "-i", video, "-r", "25", "-y", tmp_path], check=True)
        os.replace(tmp_path, video)
    finally:
        # Only still present when the conversion or the replace failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return

In [None]:
# Path of the video that the following cells convert and analyse
video_to_test = "test/unsynced.mp4"

In [None]:
# Re-encode the test video at 25 fps; the result is written back to the same path
convert_video_to_25_fps(video_to_test)

In [None]:
# Build the lip-model input (stacks of 5 mouth crops) from the video frames
lip_input = get_syncnet_lip_model_input(video_to_test)
print(lip_input.shape)

In [None]:
# Build the audio-model input (MFCC clips) from the video's audio track
audio_input = get_syncnet_audio_model_input(video_to_test)
print(audio_input.shape)

# 2. Load the Syncnet lip and audio models

In [None]:
# Load the pretrained models through the project helper; with these settings
# its result is unpacked as (audio model, lip model)
version = 'v4'
mode = 'both'
syncnet_audio_model, syncnet_lip_model = load_pretrained_syncnet_model(version=version, mode=mode, verbose=False)

In [None]:
# print(syncnet_audio_model.summary())

In [None]:
# print(syncnet_lip_model.summary())

# 3. Calculate lip-video and audio embeddings using Syncnet

In [None]:
# Run the audio model on the audio input prepared above
audio_embeddings = syncnet_audio_model.predict(audio_input)
print(audio_embeddings.shape)

In [None]:
# Run the lip model on the lip input prepared above
lip_embeddings = syncnet_lip_model.predict(lip_input)
print(lip_embeddings.shape)

# 4. Calculate the Euclidean distance between the lip and audio embeddings to check whether the video and audio are in sync

1. Pass the audio frame through the audio model to get its encoding (a 128-dimensional feature), and pass the video frame through the lip model to get its encoding (also a 128-dimensional feature).

2. Check the euclidean distance between the audio encoding and the video encoding.

3. If the distance is greater than a threshold, the audio frame and the video frame are considered to be out of sync.

In [None]:
def euclidian_distance_(np_data_1, np_data_2):
    """Return the Euclidean distance between two arrays along the last axis.

    Args:
        np_data_1: Array (or array-like) of embeddings.
        np_data_2: Array (or array-like) broadcastable against ``np_data_1``.

    Returns:
        The square root of the summed squared differences, reduced over the
        last axis (one distance per row for 2-D inputs).
    """
    # asarray lets plain lists/tuples be passed as well as ndarrays
    difference = np.asarray(np_data_1) - np.asarray(np_data_2)
    return np.sqrt(np.sum(np.square(difference), axis=-1))


# Alias without the trailing underscore, so either spelling resolves
euclidian_distance = euclidian_distance_

In [None]:
# The audio and lip inputs are cut into clips independently, so the two
# embedding arrays can differ in length; compare only the clips both have.
n_clips = min(len(audio_embeddings), len(lip_embeddings))
# Call the function under the name it is defined with (trailing underscore)
distance = euclidian_distance_(audio_embeddings[:n_clips], lip_embeddings[:n_clips])
print(distance)