# Syncnet Implementation

https://github.com/voletiv/syncnet-in-keras/

In [63]:
import os
import subprocess
import sys

import cv2
import dlib
import numpy as np
import scipy.io.wavfile as wav
import speechpy
import tqdm
from PIL import Image

Mouth detection from https://github.com/voletiv/lipreading-in-the-wild-experiments/tree/master/process-lrw


In [64]:
from process_lrw_functions import detect_mouth_in_frame, extract_audio_from_mp4
from syncnet_functions import load_pretrained_syncnet_model

In [65]:
def get_video_input(video):
    """Build the lip-model input from a video: stacks of 5 grayscale mouth crops.

    Frames are read sequentially in groups of 5. For every frame the mouth
    region returned by ``detect_mouth_in_frame`` is converted to grayscale and
    resized to 112x112; the 5 crops of a group are stacked on the last axis.

    Args:
        video: Path (or any source accepted by ``cv2.VideoCapture``) to read.

    Returns:
        ``np.ndarray`` of shape ``(num_windows, 112, 112, 5)``. Trailing frames
        that do not fill a complete group of 5 are dropped. When no complete
        group could be read (e.g. the video could not be opened) the result is
        an empty array.
    """
    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

    cap = cv2.VideoCapture(video)
    try:
        frameFPS    = int(cap.get(cv2.CAP_PROP_FPS))
        frameCount  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frameWidth  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print("FPS: {}".format(frameFPS))
        print("Frames: {}".format(frameCount))
        print("Width: {}".format(frameWidth))
        print("Height: {}".format(frameHeight))

        # Seed rectangle handed to the detector as ``prevFace`` for the first
        # frame; afterwards the rectangle returned for the previous frame is
        # passed along. NOTE(review): presumably a fallback when detection
        # fails -- confirm against detect_mouth_in_frame.
        face = dlib.rectangle(30, 30, 220, 220)

        lip_model_input = []

        while cap.isOpened():
            frames = []
            for _ in range(5):
                ok, frame = cap.read()
                if frame is None:
                    # End of stream (or read failure): stop filling this group.
                    break

                mouth, face = detect_mouth_in_frame(
                    frame, detector, predictor,
                    prevFace=face,
                    verbose=False)

                mouth = cv2.cvtColor(mouth, cv2.COLOR_BGR2GRAY)  # convert to grayscale
                mouth = cv2.resize(mouth, (112, 112))
                frames.append(mouth)

            if len(frames) != 5:
                # Incomplete trailing group: discard it and stop reading.
                break

            # syncnet requires (112, 112, 5)
            lip_model_input.append(np.stack(frames, axis=-1))
    finally:
        # Always free the decoder / file handle, even if detection raised.
        cap.release()

    return np.array(lip_model_input)

# MFCC code thanks to michiyosony 

https://github.com/voletiv/syncnet-in-keras/issues/1#issuecomment-380149724



In [66]:
EACH_MFCC_OUTPUT_FRAME_SIZE = 20  # MFCC frames (image columns) packed into one output image
NUM_MFCC_COEFFICIENTS = 13        # image rows: coefficients read from every MFCC frame

def extract_mfcc_series(wav_file, target_dir=None):
    """Turn a WAV file into a series of small grayscale MFCC "images".

    MFCC features are computed with ``speechpy`` (frame length and stride of
    0.01 s) and cut into consecutive, non-overlapping chunks of
    ``EACH_MFCC_OUTPUT_FRAME_SIZE`` frames. Each chunk is painted into an RGB
    image (negative values in the red channel, positive values in the blue
    channel, magnitude scaled so that 20 maps to 255 and clipped there), which
    is then converted to grayscale with OpenCV.

    Args:
        wav_file: Path of the WAV file to read.
        target_dir: Unused; kept so existing callers keep working.

    Returns:
        ``np.ndarray`` of shape ``(num_images, 13, 20, 1)``; leftover MFCC
        frames that do not fill a whole image are dropped. Returns ``None``
        when ``speechpy`` raises ``IndexError`` during feature extraction.
    """
    (rate, sig) = wav.read(wav_file)
    print("Sig length: {}".format(len(sig)))
    try:
        mfcc_feat = speechpy.feature.mfcc(sig, sampling_frequency=rate, frame_length=0.010, frame_stride=0.01)
    except IndexError:
        print("index error occurred while extracting mfcc")
        return None
    print('sample_rate: {}, mfcc_feat length: {}, mfcc_feat[0] length: {}'.format(rate, len(mfcc_feat), len(mfcc_feat[0])))
    num_output = len(mfcc_feat) // EACH_MFCC_OUTPUT_FRAME_SIZE

    print(mfcc_feat.shape)
    print(int(num_output))
    images = []

    for index in tqdm.tqdm(range(num_output)):
        # PIL sizes are (width, height): one column per MFCC frame, one row per
        # coefficient. The width is tied to the chunk size so the two can never
        # drift apart.
        img = Image.new('RGB', (EACH_MFCC_OUTPUT_FRAME_SIZE, NUM_MFCC_COEFFICIENTS), "black")
        pixels = img.load()
        for i in range(img.size[0]):
            frame_index = index * EACH_MFCC_OUTPUT_FRAME_SIZE + i
            for j in range(img.size[1]):
                try:
                    value = mfcc_feat[frame_index][j]
                except IndexError:
                    print("index error occurred while extracting mfcc @ " + str(frame_index) + "," + str(j))
                    break
                # Same magnitude scaling for both signs; only the channel differs.
                intensity = int(min(255, 255 * (abs(value) / 20)))
                if value < 0:
                    pixels[i, j] = (intensity, 0, 0)
                elif value > 0:
                    pixels[i, j] = (0, 0, intensity)
                # value == 0 keeps the black background pixel.

        img_to_np = np.array(img)

        # Convert to grayscale
        gray_image = cv2.cvtColor(img_to_np, cv2.COLOR_RGB2GRAY)

        # Add a trailing channel axis: (13, 20) -> (13, 20, 1)
        gray_image_exp = np.expand_dims(gray_image, axis=-1)

        images.append(gray_image_exp)

    return np.asarray(images)


def get_audio_input(video):
    """Extract the audio track of ``video`` and convert it to MFCC images.

    ffmpeg writes a mono, 16 kHz, 16-bit PCM WAV file next to the video
    (``<video>.wav``, overwritten if it exists), which is then passed to
    ``extract_mfcc_series``.

    Args:
        video: Path of the video file.

    Returns:
        Whatever ``extract_mfcc_series`` returns for the extracted WAV file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    video_path = str(video)
    audio_out = "{}.wav".format(video_path)
    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to ffmpeg verbatim.
    cmd = [
        "ffmpeg", "-y", "-loglevel", "panic",
        "-i", video_path,
        "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
        audio_out,
    ]
    # check=True: fail loudly rather than reading a missing or stale WAV file.
    subprocess.run(cmd, check=True)
    return extract_mfcc_series(audio_out)

# Getting the inputs to the models

**Make sure video is of 25fps!**
If not, use the following ffmpeg command to convert fps:

```
ffmpeg -i video.mp4 -r 25 -y video_at_25_fps.mp4
```

In [113]:
def convert_video_to_25_fps(video):
    """Re-encode ``video`` at 25 fps with ffmpeg, replacing the file in place.

    The re-encoded stream is first written to a temporary file beside the
    original and only moved over it once ffmpeg has succeeded, so a failed
    conversion leaves the original untouched.

    Args:
        video: Path of the video file to convert.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    video_path = str(video)
    root, ext = os.path.splitext(video_path)
    # Keep a media extension at the end so ffmpeg can pick the container format.
    tmp_path = "{}.25fps.tmp{}".format(root, ext or ".mp4")
    try:
        subprocess.run(
            ["ffmpeg", "-i", video_path, "-r", "25", "-y", tmp_path],
            check=True)
        # Portable replacement for ``mv``; overwrites the destination.
        os.replace(tmp_path, video_path)
    finally:
        # Only still present when the conversion failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return

In [123]:
# Clip to score; every cell below reads this path.
video_to_test = "test/unsynced.mp4"

In [124]:
# Re-encodes the clip at 25 fps; the file at video_to_test is overwritten.
convert_video_to_25_fps(video_to_test)

In [125]:
# Lip-model input: one (112, 112, 5) stack of mouth crops per group of 5 frames.
lip_input = get_video_input(video_to_test)
print(lip_input.shape)

FPS: 25
Frames: 184
Width: 320
Height: 240
(36, 112, 112, 5)


In [126]:
# Audio-model input: MFCC images extracted from the clip's audio track.
# Its first dimension should match lip_input's so the two can be compared row by row.
audio_input = get_audio_input(video_to_test)
print(audio_input.shape)

Sig length: 117077
sample_rate: 16000, mfcc_feat length: 731, mfcc_feat[0] length: 13
(731, 13)
36


100%|██████████| 36/36 [00:00<00:00, 946.64it/s]


(36, 13, 20, 1)


In [118]:
# Both settings are forwarded unchanged to load_pretrained_syncnet_model.
version = 'v4'
mode = 'both'
# With mode='both' the result is unpacked as (audio model, lip model).
syncnet_audio_model, syncnet_lip_model = load_pretrained_syncnet_model(version=version, mode=mode, verbose=False)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
syncnet_audio_model.summary()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
syncnet_lip_model.summary()

# Calculate the Euclidean distance between embeddings to see if video and audio are synced

1. Pass the audio frame through the audio model to get its encoding (a 128-dimensional feature), and pass the video frame through the lip model to get its encoding (also a 128-dimensional feature).

2. Check the euclidean distance between the audio encoding and the video encoding.

3. If the distance is greater than a threshold (say, 0.6), the audio and video are considered to be out of sync.

In [127]:
# One embedding row per MFCC image.
audio_embeddings = syncnet_audio_model.predict(audio_input)
print(audio_embeddings.shape)

(36, 128)


In [128]:
# One embedding row per 5-frame mouth stack.
lip_embeddings = syncnet_lip_model.predict(lip_input)
print(lip_embeddings.shape)

(36, 128)


In [129]:
def euclidian_distance(np_data_1, np_data_2, axis=None):
    """Euclidean (L2) norm of the element-wise difference of two arrays.

    Args:
        np_data_1: First array (or array-like).
        np_data_2: Second array (or array-like), broadcastable against the first.
        axis: Axis along which to take the norm. The default ``None`` collapses
            the whole difference into a single scalar (for 2-D batches this is
            one number for the entire batch, not one per row). Pass ``axis=-1``
            to get one distance per row.

    Returns:
        A scalar when ``axis`` is ``None``, otherwise an array of distances.
    """
    diff = np.subtract(np_data_1, np_data_2)
    return np.linalg.norm(diff, axis=axis)

In [130]:
# No axis is given, so this is a single norm over the whole batch of embedding
# differences (one scalar), not one distance per window.
distance_float = euclidian_distance(audio_embeddings, lip_embeddings)


In [110]:
# Batch-wide scalar computed above; it is not comparable to a per-window threshold.
print(distance_float)

55.3365


In [111]:
def euclidian_distance_N(np_data_1, np_data_2):
    """Row-wise Euclidean distance between two arrays.

    The difference is squared element-wise, summed over the last axis and
    square-rooted, giving one distance per entry along the leading axes.

    Args:
        np_data_1: First array (or array-like).
        np_data_2: Second array (or array-like), broadcastable against the first.

    Returns:
        Array of distances with the last axis reduced away.
    """
    delta = np.subtract(np_data_1, np_data_2)
    squared_sums = np.sum(np.square(delta), axis=-1)
    return np.sqrt(squared_sums)

In [112]:
# Per-window distances: one value per (audio, lip) embedding pair.
distance_np = euclidian_distance_N(audio_embeddings, lip_embeddings)

print(distance_np)

[ 14.7312603   10.34830475   9.46382904   9.38288784  12.34794044
   8.36612606   6.31382036   9.10655499   9.18782806   8.30923367
  12.19418621   7.3413868    8.76842499   5.95582867   9.76344299
   9.59012699   5.26596785  10.03675079   8.43529987   5.86464643
   6.0869956    6.87394238  10.75918579  12.94858932   4.79319859
  10.19445896   6.23314524  13.28144073  12.46162987   9.89130306
   7.08326292   5.84677649   9.76580238   7.60681677   6.09657145
   5.08543587   4.33298397   6.56058168]
