# Syncnet Implementation

https://github.com/voletiv/syncnet-in-keras/

In [1]:
import cv2, os, sys, numpy as np
import scipy.io.wavfile as wav
from PIL import Image
import numpy as np
import speechpy
import dlib
import tqdm

Mouth detection from https://github.com/voletiv/lipreading-in-the-wild-experiments/tree/master/process-lrw


In [2]:
from process_lrw_functions import detect_mouth_in_frame, extract_audio_from_mp4
from syncnet_functions import load_pretrained_syncnet_model

Using TensorFlow backend.


In [3]:
def get_video_input(video):
    """Turn a video file into the batch of mouth crops fed to the lip model.

    Frames are read in consecutive groups of five. For every frame the mouth
    region is located with ``detect_mouth_in_frame``, converted to grayscale
    and resized to 112x112; the five crops of a group are stacked on the last
    axis, giving one ``(112, 112, 5)`` sample per group.

    Args:
        video: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        ``np.ndarray`` of shape ``(num_groups, 112, 112, 5)``. A trailing
        group with fewer than five frames is discarded. If no complete group
        could be read the result is an empty array.

    Side effects:
        Prints the FPS, frame count, width and height reported by OpenCV.
    """
    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

    cap = cv2.VideoCapture(video)
    try:
        frameFPS    = int(cap.get(cv2.CAP_PROP_FPS))
        frameCount  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frameWidth  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print("FPS: {}".format(frameFPS))
        print("Frames: {}".format(frameCount))
        print("Width: {}".format(frameWidth))
        print("Height: {}".format(frameHeight))

        # Starting face rectangle handed to the detector as ``prevFace``;
        # afterwards each call receives the rectangle returned by the
        # previous one.
        face = dlib.rectangle(30, 30, 220, 220)

        lip_model_input = []

        while cap.isOpened():

            # Collect up to five consecutive mouth crops.
            frames = []
            for _ in range(5):
                _, frame = cap.read()
                if frame is None:
                    # End of stream (or a read failure): stop this group.
                    break

                mouth, face = detect_mouth_in_frame(
                    frame, detector, predictor,
                    prevFace=face,
                    verbose=False)

                mouth = cv2.cvtColor(mouth, cv2.COLOR_BGR2GRAY)  # convert to grayscale
                mouth = cv2.resize(mouth, (112, 112))
                frames.append(mouth)

            if len(frames) != 5:
                # Incomplete trailing group: drop it and stop reading.
                break

            # The lip model takes (112, 112, 5) samples.
            lip_model_input.append(np.stack(frames, axis=-1))
    finally:
        # Always give the file handle / decoder back, even if detection raised.
        cap.release()

    return np.array(lip_model_input)

# MFCC code thanks to michiyosony 

https://github.com/voletiv/syncnet-in-keras/issues/1#issuecomment-380149724



In [4]:
# Number of consecutive MFCC frames packed into one output image.
EACH_MFCC_OUTPUT_FRAME_SIZE = 20

def extract_mfcc_series(wav_file, target_dir=None):
    """Convert a WAV file into a stack of grayscale MFCC images.

    MFCC features are computed with ``speechpy`` and cut into consecutive
    windows of ``EACH_MFCC_OUTPUT_FRAME_SIZE`` frames; leftover frames at the
    end are dropped. Each window is painted into a 20x13 RGB image: negative
    coefficients go to the red channel and positive ones to the blue channel,
    both scaled by 255/20 and capped at 255. The image is then converted to
    grayscale and given a trailing channel axis.

    Args:
        wav_file: Path of the WAV file to read.
        target_dir: Not used by the current implementation.

    Returns:
        ``np.ndarray`` with one ``(13, 20, 1)`` image per window, or ``None``
        when ``speechpy`` raises ``IndexError`` during feature extraction.

    Side effects:
        Prints signal length, sample rate, feature shape and window count,
        and shows a ``tqdm`` progress bar.
    """
    rate, sig = wav.read(wav_file)
    print("Sig length: {}".format(len(sig)))
    try:
        mfcc_feat = speechpy.feature.mfcc(sig, sampling_frequency=rate, frame_length=0.010, frame_stride=0.01)
    except IndexError:
        print("index error occurred while extracting mfcc")
        return
    print('sample_rate: {}, mfcc_feat length: {}, mfcc_feat[0] length: {}'.format(rate, len(mfcc_feat), len(mfcc_feat[0])))
    num_output = len(mfcc_feat) // EACH_MFCC_OUTPUT_FRAME_SIZE

    print(mfcc_feat.shape)
    print(int(num_output))
    images = []

    for window in tqdm.tqdm(range(num_output)):
        img = Image.new('RGB', (20, 13), "black")
        pixels = img.load()
        width, height = img.size
        first_frame = window * EACH_MFCC_OUTPUT_FRAME_SIZE

        # Image column = MFCC frame inside the window, image row = coefficient.
        for col in range(width):
            frame_index = first_frame + col
            for row in range(height):
                try:
                    coeff = mfcc_feat[frame_index][row]
                    if coeff < 0:
                        pixels[col, row] = (int(min(255, 255 * (coeff / -20))), 0, 0)
                    elif coeff > 0:
                        pixels[col, row] = (0, 0, int(min(255, 255 * (coeff / 20))))
                    # A coefficient of exactly 0 leaves the pixel black.
                except IndexError:
                    print("index error occurred while extracting mfcc @ " + str(frame_index) + "," + str(row))
                    # Abandon the rest of this column only.
                    break

        # RGB -> single-channel grayscale, then add the trailing channel axis.
        gray_image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
        images.append(np.expand_dims(gray_image, axis=-1))

    return np.asarray(images)


def get_audio_input(video):
    """Extract the audio track of ``video`` and return its MFCC image series.

    ffmpeg writes the audio next to the video as ``<video>.wav`` (16-bit PCM,
    mono, 16 kHz, overwriting any existing file), and that file is then passed
    to ``extract_mfcc_series``.

    Args:
        video: Path of the input video file.

    Returns:
        Whatever ``extract_mfcc_series`` returns for the extracted WAV file.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Imported here so this cell does not depend on an edit to the imports cell.
    import subprocess

    audio_out = "{}.wav".format(video)
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to ffmpeg unchanged.
    cmd = [
        "ffmpeg", "-y", "-loglevel", "panic",
        "-i", str(video),
        "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
        audio_out,
    ]
    # check_call raises on failure, so a stale .wav from an earlier run is
    # never processed by mistake.
    subprocess.check_call(cmd, stdin=subprocess.DEVNULL)
    return extract_mfcc_series(audio_out)

# Getting the inputs to the models

**Make sure video is of 25fps!**
If not, use the following ffmpeg command to convert fps:

```
ffmpeg -i video.mp4 -r 25 -y video_at_25_fps.mp4
```

In [5]:
def convert_video_to_25_fps(video):
    """Re-encode ``video`` at 25 fps, replacing the original file.

    ffmpeg writes the converted clip to a temporary ``.mp4`` next to the
    input; only after ffmpeg succeeds is the temporary file moved over the
    original. On any failure the original is left untouched and the temporary
    file is removed.

    Args:
        video: Path of the video file to convert in place.

    Returns:
        None.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started or the final
            rename fails.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Imported here so this cell does not depend on an edit to the imports cell.
    import subprocess

    video = str(video)
    # Per-video temp name in the same directory: avoids clashing with an
    # unrelated "tmp.mp4" in the working directory and keeps the final rename
    # on one filesystem.
    tmp_path = "{}.tmp25fps.mp4".format(video)
    try:
        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are handled correctly.
        subprocess.check_call(
            ["ffmpeg", "-i", video, "-r", "25", "-y", tmp_path],
            stdin=subprocess.DEVNULL)
        # Portable replacement for the shell "mv"; overwrites the original.
        os.replace(tmp_path, video)
    finally:
        # Present only if ffmpeg failed part-way or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return

In [6]:
# Clip used for the single-file walkthrough in the following cells.
video_to_test = "test/unsynced.mp4"

In [7]:
# Re-encode the clip at 25 fps; the helper overwrites the file at this path.
convert_video_to_25_fps(video_to_test)

In [8]:
# Build the lip-model input: one (112, 112, 5) stack per group of 5 frames.
lip_input = get_video_input(video_to_test)
print(lip_input.shape)

FPS: 25
Frames: 184
Width: 320
Height: 240
(36, 112, 112, 5)


In [9]:
# Extract the audio track and convert it to MFCC images, one per 20 MFCC frames.
audio_input = get_audio_input(video_to_test)
print(audio_input.shape)

Sig length: 117077
sample_rate: 16000, mfcc_feat length: 731, mfcc_feat[0] length: 13
(731, 13)
36


100%|██████████| 36/36 [00:00<00:00, 1284.28it/s]


(36, 13, 20, 1)


In [10]:
# Load the pretrained SyncNet models; with mode='both' the loader's result is
# unpacked into the audio model and the lip model.
# NOTE(review): the accepted values of `version` are defined in syncnet_functions — confirm there.
version = 'v4'
mode = 'both'
syncnet_audio_model, syncnet_lip_model = load_pretrained_syncnet_model(version=version, mode=mode, verbose=False)

In [11]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
syncnet_audio_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1_audio (Conv2D)         (None, 13, 20, 64)        640       
_________________________________________________________________
bn1_audio (BatchNormalizatio (None, 13, 20, 64)        256       
_________________________________________________________________
relu1_audio (Activation)     (None, 13, 20, 64)        0         
_________________________________________________________________
conv2_audio (Conv2D)         (None, 13, 20, 128)       73856     
_________________________________________________________________
bn2_audio (BatchNormalizatio (None, 13, 20, 128)       512       
_________________________________________________________________
relu2_audio (Activation)     (None, 13, 20, 128)       0         
_________________________________________________________________
pool2_audio (MaxPooling2D)   (None, 11, 9, 128)        0         
__________

In [12]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
syncnet_lip_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1_lip (Conv2D)           (None, 110, 110, 96)      4416      
_________________________________________________________________
bn1_lip (BatchNormalization) (None, 110, 110, 96)      384       
_________________________________________________________________
relu1_lip (Activation)       (None, 110, 110, 96)      0         
_________________________________________________________________
pool1_lip (MaxPooling2D)     (None, 54, 54, 96)        0         
_________________________________________________________________
conv2_lip (Conv2D)           (None, 50, 50, 256)       614656    
_________________________________________________________________
bn2_lip (BatchNormalization) (None, 50, 50, 256)       1024      
_________________________________________________________________
relu2_lip (Activation)       (None, 50, 50, 256)       0         
__________

# Calculate embedding Euclidean distance to see if video / audio is synced

1. Pass the audio frame through the audio model to get its encoding (a 128-dimensional feature), and pass the video frame through the lip model to get its encoding (also a 128-dimensional feature).

2. Check the euclidean distance between the audio encoding and the video encoding.

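3. If the distance is greater than a threshold (say, 0.6), the audio and video are considered out of sync. Note that `euclidian_distance` below returns a single norm over all frames, so its value grows with the number of frames and is not directly comparable to a fixed per-frame threshold.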

In [13]:
# Embed every MFCC image with the audio model (one embedding row per image).
audio_embeddings = syncnet_audio_model.predict(audio_input)
print(audio_embeddings.shape)

(36, 128)


In [14]:
# Embed every 5-frame mouth stack with the lip model (one embedding row per stack).
lip_embeddings = syncnet_lip_model.predict(lip_input)
print(lip_embeddings.shape)

(36, 128)


In [15]:
def euclidian_distance(np_data_1, np_data_2):
    """Return one Euclidean distance between two 2-D embedding arrays.

    When the shapes differ, a notice is printed and both arrays are truncated
    along the first axis to the shorter length. The result is the norm of the
    element-wise difference taken over all remaining entries, i.e. a single
    scalar for the whole pair of arrays.

    Args:
        np_data_1: First array, indexed as ``[row, column]``.
        np_data_2: Second array, indexed as ``[row, column]``.

    Returns:
        Scalar norm of ``np_data_1 - np_data_2``.
    """
    shapes_agree = np_data_1.shape == np_data_2.shape
    if not shapes_agree:
        print("==> Dimensions don't match {} {}. Clipping".format(np_data_1.shape, np_data_2.shape))
        # Keep only the rows both arrays have.
        common_rows = min(np_data_1.shape[0], np_data_2.shape[0])
        np_data_1, np_data_2 = np_data_1[:common_rows, :], np_data_2[:common_rows, :]

    return np.linalg.norm(np_data_1 - np_data_2)

In [16]:
# Single scalar distance taken over all embedding rows at once.
distance_float = euclidian_distance(audio_embeddings, lip_embeddings)


In [17]:
# Show the scalar distance computed in the previous cell.
print(distance_float)

47.7346


In [18]:
def euclidian_distance_N(np_data_1, np_data_2):
    """Return the Euclidean distance along the last axis of two arrays.

    Unlike ``euclidian_distance`` this does not collapse everything into one
    scalar: for 2-D inputs it yields one distance per row.

    Args:
        np_data_1: First array.
        np_data_2: Second array; must be subtractable from ``np_data_1``.

    Returns:
        Array of distances with the last axis reduced away.
    """
    delta = np.subtract(np_data_1, np_data_2)
    squared_sum = np.sum(np.square(delta), axis=-1)
    return np.sqrt(squared_sum)

In [19]:
# Per-row distances: one value for each pair of audio / lip embeddings.
distance_np = euclidian_distance_N(audio_embeddings, lip_embeddings)

print(distance_np)

[ 14.6083622    6.9940815    6.94163227   6.37897682   4.63438559
   4.05189753   9.52804565   8.16981983   8.11581326   7.10538149
   9.50084972   7.13821268   6.25484085   4.89169073   3.47015381
   4.1924243    6.65998745  11.94016838   9.45641804  10.38359356
  10.20247841   7.65069389   7.0541501    7.41728783   9.06472588
   5.98884869   7.62462854   6.5692277    4.89220715  11.58408165
   7.99294329  11.91477299   7.73058414   4.51798201   6.656991
   4.72928667]


In [21]:
from os import listdir
from os.path import isfile, join

# Run the full pipeline on every "bad-dub*.mp4" clip in the test folder and
# report one overall audio/lip distance per clip.
test_path="test/"
# listdir() returns entries in arbitrary order; sort them so the report is
# printed in the same order on every run and every platform.
for f in sorted(listdir(test_path)):
    tfile=join(test_path, f)
    if(isfile(tfile) and f.startswith("bad-dub") and f.endswith(".mp4")):

        print("*" * 30)
        print(tfile)

        # Overwrites the clip on disk with its 25 fps re-encode.
        convert_video_to_25_fps(tfile)

        lip_input = get_video_input(tfile)
        print(lip_input.shape)

        audio_input = get_audio_input(tfile)
        print(audio_input.shape)

        audio_embeddings = syncnet_audio_model.predict(audio_input)
        print(audio_embeddings.shape)

        lip_embeddings = syncnet_lip_model.predict(lip_input)
        print(lip_embeddings.shape)

        # Scalar distance over all rows; rows are clipped to the shorter
        # input when the two embedding counts differ.
        distance_float = euclidian_distance(audio_embeddings, lip_embeddings)

        print("Distance: {}".format(distance_float))



******************************
test/bad-dub-01.mp4
FPS: 25
Frames: 37
Width: 320
Height: 240
(7, 112, 112, 5)
Sig length: 22528
sample_rate: 16000, mfcc_feat length: 140, mfcc_feat[0] length: 13
(140, 13)
7


100%|██████████| 7/7 [00:00<00:00, 999.32it/s]


(7, 13, 20, 1)
(7, 128)
(7, 128)
Distance: 22.37862777709961
******************************
test/bad-dub-02.mp4
FPS: 25
Frames: 69
Width: 320
Height: 240
(13, 112, 112, 5)
Sig length: 43008
sample_rate: 16000, mfcc_feat length: 268, mfcc_feat[0] length: 13
(268, 13)
13


100%|██████████| 13/13 [00:00<?, ?it/s]


(13, 13, 20, 1)
(13, 128)
(13, 128)
Distance: 29.282217025756836
******************************
test/bad-dub-03.mp4
FPS: 25
Frames: 134
Width: 320
Height: 240
(26, 112, 112, 5)
Sig length: 84651
sample_rate: 16000, mfcc_feat length: 529, mfcc_feat[0] length: 13
(529, 13)
26


100%|██████████| 26/26 [00:00<00:00, 1299.16it/s]


(26, 13, 20, 1)
(26, 128)
(26, 128)
Distance: 43.84505844116211
******************************
test/bad-dub-04.mp4
FPS: 25
Frames: 89
Width: 320
Height: 240
(17, 112, 112, 5)
Sig length: 56320
sample_rate: 16000, mfcc_feat length: 352, mfcc_feat[0] length: 13
(352, 13)
17


100%|██████████| 17/17 [00:00<00:00, 3342.54it/s]


(17, 13, 20, 1)
(17, 128)
(17, 128)
Distance: 35.79926300048828
******************************
test/bad-dub-05.mp4
FPS: 25
Frames: 42
Width: 320
Height: 240
(8, 112, 112, 5)
Sig length: 25941
sample_rate: 16000, mfcc_feat length: 162, mfcc_feat[0] length: 13
(162, 13)
8


100%|██████████| 8/8 [00:00<?, ?it/s]


(8, 13, 20, 1)
(8, 128)
(8, 128)
Distance: 25.937734603881836
******************************
test/bad-dub-06.mp4
FPS: 25
Frames: 107
Width: 320
Height: 240
(21, 112, 112, 5)
Sig length: 67925
sample_rate: 16000, mfcc_feat length: 424, mfcc_feat[0] length: 13
(424, 13)
21


100%|██████████| 21/21 [00:00<00:00, 671.92it/s]


(21, 13, 20, 1)
(21, 128)
(21, 128)
Distance: 35.53982925415039
******************************
test/bad-dub-07.mp4
FPS: 25
Frames: 119
Width: 320
Height: 240
(23, 112, 112, 5)
Sig length: 75776
sample_rate: 16000, mfcc_feat length: 473, mfcc_feat[0] length: 13
(473, 13)
23


100%|██████████| 23/23 [00:00<00:00, 717.23it/s]


(23, 13, 20, 1)
(23, 128)
(23, 128)
Distance: 40.302101135253906
******************************
test/bad-dub-multispeaker.mp4
FPS: 25
Frames: 545
Width: 320
Height: 240
(109, 112, 112, 5)
Sig length: 347477
sample_rate: 16000, mfcc_feat length: 2171, mfcc_feat[0] length: 13
(2171, 13)
108


100%|██████████| 108/108 [00:00<00:00, 1298.85it/s]


(108, 13, 20, 1)
(108, 128)
(109, 128)
==> Dimensions don't match (108, 128) (109, 128). Clipping
Distance: 81.50373077392578
