# Syncnet Implementation

https://github.com/voletiv/syncnet-in-keras/

In [11]:
import cv2, os, sys, numpy as np
import scipy.io.wavfile as wav
from PIL import Image
import numpy as np
import speechpy
import dlib

Mouth detection from https://github.com/voletiv/lipreading-in-the-wild-experiments/tree/master/process-lrw

In [12]:

from process_lrw_functions import detect_mouth_in_frame, extract_audio_from_mp4
from syncnet_functions import load_pretrained_syncnet_model

In [13]:
def get_video_input(video):

	detector = dlib.get_frontal_face_detector()
	predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

	cap 		= cv2.VideoCapture(video)
	frameFPS 	= int(cap.get(cv2.CAP_PROP_FPS))
	frameCount 	= int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	frameWidth 	= int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

	# print("FPS: {}".format(frameFPS))
	# print("Frames: {}".format(frameCount))
	# print("Width: {}".format(frameWidth))
	# print("Height: {}".format(frameHeight))
	
	face = dlib.rectangle(30, 30, 220, 220)
	
	lip_model_input = []

	
	while(cap.isOpened()):

		# If frames are extracted from video, all frames are read
		frames = []
		for i in range(5):
			_, frame 	= cap.read()
			if(frame is None):
				break

			mouth, face = detect_mouth_in_frame(
				frame, detector, predictor,
				prevFace=face,
				verbose=False)

			mouth = cv2.cvtColor(mouth, cv2.COLOR_BGR2GRAY) # convert to grayscale
			mouth = cv2.resize( mouth, (112,112))
			# mouth = mouth[:, :,0] 	# drop the RGB channel
			frames.append(mouth)

		
		if(len(frames)==0):
			break

		stacked = np.stack(frames ,axis=2)	#syncnet requires (112,112,5)
		# input(stacked.shape)
		lip_model_input.append(stacked)
	
	return np.asarray(lip_model_input)

# MFCC code thanks to michiyosony 

https://github.com/voletiv/syncnet-in-keras/issues/1#issuecomment-380149724



In [14]:
EACH_MFCC_OUTPUT_FRAME_SIZE = 20

def extract_mfcc_series(wav_file, target_dir=None):
	(rate, sig) = wav.read(wav_file)

	try:
#         mfcc_feat = speechpy.feature.mfcc(sig, sampling_frequency=rate, frame_length=0.010, frame_stride=0.01)
        mfcc_feat = speechpy.feature.mfcc(sig, sampling_frequency=rate)
	except IndexError:
		print("index error occurred while extracting mfcc")
		return
	print('sample_rate: {}, mfcc_feat length: {}, mfcc_feat[0] length: {}'.format(rate, len(mfcc_feat), len(mfcc_feat[0])))
	num_output = len(mfcc_feat) / EACH_MFCC_OUTPUT_FRAME_SIZE
	num_output += 1 if (len(mfcc_feat) % EACH_MFCC_OUTPUT_FRAME_SIZE > 0) else 0
	
	# print(mfcc_feat.shape)
	# input(int(num_output))
	images = []

	for index in range(int(num_output)):
		img = Image.new('L', (20, 13), "black")
		pixels = img.load()
		for i in range(img.size[0]):
			for j in range(img.size[1]):
				frame_index = index * EACH_MFCC_OUTPUT_FRAME_SIZE + i
				# print(frame_index)
				try:
					if mfcc_feat[frame_index][j] < 0:
						red_amount = min(255, 255 * (mfcc_feat[frame_index][j] / -20))
						pixels[i, j] = (int(red_amount), 0, 0)
					elif (mfcc_feat[frame_index][j] > 0):
						blue_amount = min(255, 255 * (mfcc_feat[frame_index][j] / 20))
						pixels[i, j] = (0, 0, int(blue_amount))
				except IndexError:
					print("index error occurred while extracting mfcc @ " + str(frame_index) + "," + str(j))
					break
		# img.save("{}/mfcc_{:03d}.png".format(target_dir, index), 'PNG')
		
		img_to_np = np.array(img)
		# img_to_np = img_to_np[:,:,0]
        # print(img_to_np.shape)

		images.append(img_to_np)

	return np.asarray(images)


def get_audio_input(video):

	audio_out = "{}.wav".format(video)
	cmd="ffmpeg -y -loglevel panic -i {} -acodec pcm_s16le -ac 1 -ar 16000 {}".format(video, audio_out)
	os.system(cmd)
	
	return extract_mfcc_series(audio_out)


TabError: inconsistent use of tabs and spaces in indentation (<ipython-input-14-87a384c205b2>, line 8)

Code below - For some reason, i get this error on different videos

> ValueError: could not broadcast input array from shape (112,112,5) into shape (112,112)

In [15]:
lip_input = get_video_input("test.mp4")
print(lip_input.shape)

(23, 112, 112, 5)


In [16]:
audio_input = get_audio_input("test.mp4")
print(audio_input.shape)

sample_rate: 16000, mfcc_feat length: 768, mfcc_feat[0] length: 13
index error occurred while extracting mfcc @ 768,0
index error occurred while extracting mfcc @ 769,0
index error occurred while extracting mfcc @ 770,0
index error occurred while extracting mfcc @ 771,0
index error occurred while extracting mfcc @ 772,0
index error occurred while extracting mfcc @ 773,0
index error occurred while extracting mfcc @ 774,0
index error occurred while extracting mfcc @ 775,0
index error occurred while extracting mfcc @ 776,0
index error occurred while extracting mfcc @ 777,0
index error occurred while extracting mfcc @ 778,0
index error occurred while extracting mfcc @ 779,0
(39, 13, 20, 3)


In [17]:
version = 'v4'
mode = 'both'
syncnet_audio_model, syncnet_lip_model = load_pretrained_syncnet_model(version=version, mode=mode, verbose=False)


In [None]:
print(syncnet_audio_model.summary())

In [None]:
print(syncnet_lip_model.summary())

In [18]:
# print(len(outputs))

def get_layer_by_name(the_model, layer_name):
    return_layer =None
    for layer in the_model.layers:
        config = layer.get_config()
        if(config["name"] is layer_name):
            return_layer = layer
    return return_layer


def euclidian_distance(np_data_1, np_data_2):
    return np.sqrt(np.sum(np.square(np.subtract(np_data_1, np_data_2))))

## Getting model layer output

https://stackoverflow.com/questions/41711190/keras-how-to-get-the-output-of-each-layer

In [19]:
from keras import backend as K

def get_layer_output(model, layer, input_data):
    inp = model.input     
    output = layer.output
    layer_fcn = K.function([inp]+ [K.learning_phase()], [ output ])
    return layer_fcn([ input_data , 1.])

# Calculate embedding Euclidian distance to see if video / audio is synced

1. Pass the audio frame through the audio model to get its encoding (a 256-dimensional feature), pass the video frame through the lip model to get its encoding (a 256-dimensional features)

2. Check the euclidean distance between the audio encoding and the video encoding.

3. If the distance is greater than a threshold (say, 0.6), then it is said the audio and video are not in sync.

In [20]:
audio_256d_encoding_layer = get_layer_by_name(syncnet_audio_model, "fc6_audio")
audio_256d_encoding = get_layer_output(syncnet_audio_model, audio_256d_encoding_layer, audio_input)

ValueError: Cannot feed value of shape (39, 13, 20, 3) for Tensor 'conv1_audio_input_1:0', which has shape '(?, 13, 20, 1)'

In [None]:
lip_256d_encoding_layer= get_layer_by_name(syncnet_lip_model, "fc6_lip")
lip_256d_encoding = get_layer_output(syncnet_lip_model, lip_256d_encoding_layer, lip_input)


In [None]:
distace = euclidian_distance(audio_256d_encoding, lip_256d_encoding)
distance

In [None]:
# audio_prediction = syncnet_audio_model.predict(audio_input)
# lip_prediction = syncnet_lip_model.predict(lip_input)

# print(audio_prediction)
# input(">")
# print(lip_prediction)