# Milestone 3

Now you can compute a feature vector for a single image, which is the cropped face from a video frame.

The goal now is, **for each video and for each frame** of the video, to **detect the face** in the frame, **compute the features** for that face, and save the resulting features to disk in an **HDF5 file**.

You should have one HDF5 file for each video. 

\\
The file will contain a **matrix** with the number of **rows** equal to the number of **frames** in that video and the number of **columns** equal to the number of **features** you compute for a single face.

The **HDF5 files should be saved in the same directory structure that the video database has**, but instead of videos you will have HDF5 files with features.

To loop through the videos inside a directory, you can use standard python routines for recursively traversing the directory.

In [None]:
import os
import tarfile
import numpy as np
import glob
import os
import cv2

# some settings to make it smoothly runnable in Jupyter
os.environ['KMP_DUPLICATE_LIB_OK']='True'

%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
!wget https://liveproject-resources.s3.amazonaws.com/other/detectingdeepfakes/DeepfakeTIMIT.tar.gz

--2020-07-21 16:08:34--  https://liveproject-resources.s3.amazonaws.com/other/detectingdeepfakes/DeepfakeTIMIT.tar.gz
Resolving liveproject-resources.s3.amazonaws.com (liveproject-resources.s3.amazonaws.com)... 52.216.251.76
Connecting to liveproject-resources.s3.amazonaws.com (liveproject-resources.s3.amazonaws.com)|52.216.251.76|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 226611200 (216M) [application/x-gzip]
Saving to: ‘DeepfakeTIMIT.tar.gz’


2020-07-21 16:08:37 (63.7 MB/s) - ‘DeepfakeTIMIT.tar.gz’ saved [226611200/226611200]



In [None]:
# Extract the DeepfakeTIMIT archive into the current working directory.
filename = "DeepfakeTIMIT.tar.gz"
# The context manager closes the archive handle even if extraction fails;
# previously the file stayed open for the lifetime of the kernel.
# NOTE(review): extractall() trusts the member paths stored in the archive,
# so only run it on archives from a trusted source.
with tarfile.open(filename) as tf:
    tf.extractall()

# Show the top-level contents as a quick sanity check.
os.listdir('DeepfakeTIMIT')

['fram1-original.mov',
 'deepfake_images_1.png',
 'lower_quality',
 'deepfake_images_2.png',
 'fadg0-fram1-roi93.mov',
 'higher_quality',
 'fadg0-original.mov',
 '.dircksum',
 'README.txt']

In [None]:
# Download and unzip the VidTIMIT dataset
!wget https://liveproject-resources.s3.amazonaws.com/other/detectingdeepfakes/VidTIMIT.zip

!unzip -q VidTIMIT.zip
os.listdir('VidTIMIT')

--2020-07-21 16:08:45--  https://liveproject-resources.s3.amazonaws.com/other/detectingdeepfakes/VidTIMIT.zip
Resolving liveproject-resources.s3.amazonaws.com (liveproject-resources.s3.amazonaws.com)... 52.216.147.124
Connecting to liveproject-resources.s3.amazonaws.com (liveproject-resources.s3.amazonaws.com)|52.216.147.124|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1358810924 (1.3G) [application/zip]
Saving to: ‘VidTIMIT.zip’


2020-07-21 16:09:18 (40.0 MB/s) - ‘VidTIMIT.zip’ saved [1358810924/1358810924]



['fcmh0',
 'mrgg0',
 'mdbb0',
 'felc0',
 'fcmr0',
 'mrcz0',
 'msjs1',
 'fgjd0',
 'mbdg0',
 'mbjk0',
 'fdac1',
 'mmdb1',
 'mcem0',
 'mdab0',
 'mabw0',
 'mwbt0',
 'fjre0',
 'fdms0',
 'fjas0',
 'fram1',
 'fkms0',
 'mrjo0',
 'fadg0',
 'mmdm2',
 'mpdf0',
 'fcft0',
 'mjar0',
 'mgwt0',
 'mtas1',
 'fpkt0',
 'fedw0',
 'mreb0',
 'mtmr0',
 'faks0',
 'mjsw0',
 'mccs0',
 'mpgl0',
 'mstk0',
 'fcrh0',
 'fdrd1',
 'fjwb0',
 'mdld0',
 'fjem0']

In [None]:
# Genuine videos: every .avi file found anywhere below the VidTIMIT folder.
real_dir = 'VidTIMIT'
real_videos_list = glob.glob(f'{real_dir}/**/*.avi', recursive=True)
print(f'No. of real videos: {len(real_videos_list)}')

# Deepfake videos: only the higher-quality subset of DeepfakeTIMIT is used.
fake_dir_hq = 'DeepfakeTIMIT/higher_quality'
fake_videos_list = glob.glob(f'{fake_dir_hq}/**/*.avi', recursive=True)
print(f'No. of HQ fake videos: {len(fake_videos_list)}')

No. of real videos: 430
No. of HQ fake videos: 320


In [None]:
# Sort the lists of videos
# glob returns paths in arbitrary order; sort both lists so the row order of
# the feature matrices built later is reproducible.
real_videos_list.sort()
fake_videos_list.sort()

In [None]:
# How many frames per video?
def count_video_frames(video_path):
    """Return the number of frames OpenCV reports for a video file.

    The value is read from the ``cv2.CAP_PROP_FRAME_COUNT`` property of the
    capture; no frame is decoded.

    Parameters
    ----------
    video_path : str
        Path of the video file to inspect.

    Returns
    -------
    int
        Frame count reported by OpenCV, truncated to an integer.
        NOTE(review): presumably 0 when the file cannot be opened - confirm
        against the OpenCV version in use.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the handle explicitly: this function is called once per
        # video, so unreleased captures would pile up.
        cap.release()

In [None]:
# Num frames REAL
# Frame count of every real video; only the minimum is reported.
f = list(map(count_video_frames, real_videos_list))
print('Minimum number of frames in real videos:', min(f))

Minimum number of frames in real videos: 54


In [None]:
# Num frames FAKE
# Frame count of every fake video; only the minimum is reported.
f = list(map(count_video_frames, fake_videos_list))
print('Minimum number of frames in fake videos:', min(f))

Minimum number of frames in fake videos: 54


## Detect faces and extract features

In [None]:
# Install the face detector
!pip install mtcnn

Collecting mtcnn
[?25l  Downloading https://files.pythonhosted.org/packages/67/43/abee91792797c609c1bf30f1112117f7a87a713ebaa6ec5201d5555a73ef/mtcnn-0.1.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.8MB/s 
Installing collected packages: mtcnn
Successfully installed mtcnn-0.1.0


In [None]:
from mtcnn import MTCNN

In [None]:
def detect_face(image, box_scale=0.15, face_detector=None):
    """Detect a face in a BGR frame and return an enlarged crop around it.

    Only the first detection returned by the detector is used.

    Parameters
    ----------
    image : numpy.ndarray or None
        Frame in OpenCV's BGR channel order, indexed as (row, column, channel).
        ``None`` (for example the result of a failed ``cap.read()``) is
        treated as "no face".
    box_scale : float
        Fraction of the detected box width/height that is added on every side
        of the box before cropping.
    face_detector : object, optional
        Object exposing ``detect_faces(rgb_image)``. When omitted, the
        notebook-level ``detector`` is used; it is created with ``MTCNN()``
        on first use if no such global exists yet.

    Returns
    -------
    numpy.ndarray or None
        A copy of the cropped face region, or ``None`` when there is no frame,
        no detection, or the enlarged box lies entirely outside the frame.
    """
    global detector

    if image is None:
        return None

    if face_detector is None:
        # Share one detector across calls instead of relying on a global that
        # some earlier (possibly deleted) cell happened to define.
        if globals().get('detector') is None:
            detector = MTCNN()
        face_detector = detector

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    detection_result = face_detector.detect_faces(image_rgb)
    if len(detection_result) == 0:
        return None

    box_x, box_y, box_w, box_h = detection_result[0]['box']
    # enlarge detection box by the given scale on every side
    x = int(box_x - box_scale * box_w)
    y = int(box_y - box_scale * box_h)
    w = int(box_w + box_scale * box_w * 2)
    h = int(box_h + box_scale * box_h * 2)

    # Clamp the box to the frame. A negative start index would otherwise be
    # interpreted by NumPy as "count from the end" and yield an empty or
    # unrelated crop for faces close to the top/left border.
    frame_h, frame_w = image.shape[:2]
    x0, y0 = max(x, 0), max(y, 0)
    x1, y1 = min(x + w, frame_w), min(y + h, frame_h)
    if x1 <= x0 or y1 <= y0:
        return None

    # crop a face
    return image[y0:y1, x0:x1, :].copy()

In [None]:
# Compute features

# Note that the number of bins we use for the histogram is a parameter of the system
# more bins - more features
def compute_hist(image, num_bins=64):
    """Return the normalized intensity histogram of ``image``.

    All values of ``image`` (every channel) are flattened and binned into
    ``num_bins`` equal-width bins over the range [0, 256]. With
    ``density=True`` the result is a probability density, so the bin values
    multiplied by the bin width sum to 1.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``num_bins``.
    """
    pixel_values = image.ravel()
    densities, _edges = np.histogram(pixel_values, bins=num_bins,
                                     range=[0, 256], density=True)
    return densities

import skimage.metrics
# Number of histogram bins passed to compute_hist by compute_features.
# Together with the three image-quality scores this gives 64 + 3 = 67
# features per face.
num_hist_bins = 64

def compute_blurred_image(image, kernel_size=3, sigma=0.5):
    """Return a Gaussian-blurred version of ``image``.

    Parameters
    ----------
    image : array accepted by ``cv2.GaussianBlur``
    kernel_size : int
        Side length of the square blur kernel.
    sigma : float
        Value passed to ``cv2.GaussianBlur`` as its third argument (sigmaX).
    """
    ksize = (kernel_size, kernel_size)
    return cv2.GaussianBlur(image, ksize, sigma)

def mse(x, y):
    """Return the normalized root-mean-square error between ``x`` and ``y``.

    NOTE: despite its name this is not a plain mean squared error; it
    delegates to ``skimage.metrics.normalized_root_mse``. The name is kept
    because other cells call it.
    """
    nrmse = skimage.metrics.normalized_root_mse(x, y)
    return nrmse

def psnr(x, y):
    """Return the peak signal-to-noise ratio between ``x`` and ``y``.

    Delegates to ``skimage.metrics.peak_signal_noise_ratio`` with a fixed
    ``data_range`` of 255 (assumes 8-bit image data - TODO confirm for
    other inputs).
    """
    ratio = skimage.metrics.peak_signal_noise_ratio(x, y, data_range=255)
    return ratio

def ssim(x, y):
    """Return the structural similarity index between two colour images.

    The last axis of ``x`` and ``y`` is treated as the channel axis. Gaussian
    weighting with ``sigma=1.5``, population covariance and a ``data_range``
    of 255 are used, as before.

    Newer scikit-image releases replaced the ``multichannel=True`` flag with
    ``channel_axis``. Both APIs accept ``**kwargs``, so passing the wrong
    keyword is not reliably reported as an error; the keyword is therefore
    chosen by inspecting the installed function's signature.
    """
    import inspect

    structural_similarity = skimage.metrics.structural_similarity
    parameters = inspect.signature(structural_similarity).parameters
    if 'channel_axis' in parameters:
        channel_kwargs = {'channel_axis': -1}
    else:
        channel_kwargs = {'multichannel': True}

    return structural_similarity(x, y,
                                 gaussian_weights=True, sigma=1.5,
                                 use_sample_covariance=False, data_range=255,
                                 **channel_kwargs)

def compute_features(image):
    """Build the feature vector for one face crop.

    The crop is compared with a Gaussian-blurred copy of itself, and the three
    resulting scores are followed by the intensity histogram:

        [ssim, mse, psnr, hist[0], ..., hist[num_hist_bins - 1]]

    Returns
    -------
    numpy.ndarray
        1-D array of length ``3 + num_hist_bins``.
    """
    reference = compute_blurred_image(image)
    quality_scores = [
        ssim(image, reference),
        mse(image, reference),
        psnr(image, reference),
    ]
    histogram = compute_hist(image, num_bins=num_hist_bins)
    return np.concatenate([quality_scores, histogram])

In [None]:
  def detect_and_extract_features(video_path, box_scale=0.15, limit_faces=-1,
                                  num_frames=54, num_features=67):
      """Extract per-frame face features from the first frames of a video.

      Parameters
      ----------
      video_path : str
          Path of the video file.
      box_scale : float
          Forwarded to ``detect_face`` to enlarge the detected face box.
      limit_faces : int
          If not -1, process at most this many frames.
      num_frames : int
          Number of leading frames that get a row in the result. The default
          of 54 is the minimum frame count reported for both datasets earlier
          in this notebook, so every video yields a matrix of the same size.
      num_features : int
          Length of the vector returned by ``compute_features``
          (3 quality scores + 64 histogram bins = 67 by default).

      Returns
      -------
      numpy.ndarray
          Array of shape ``(1, num_frames * num_features)``: the per-frame
          feature rows flattened in frame order. Rows stay all-zero for frames
          that could not be read, had no detected face, or were skipped
          because of ``limit_faces``.
      """
      global detector
      # detect_face looks up the notebook-level `detector`. Create it once if
      # it does not exist yet, rather than building a new, unused MTCNN for
      # every video.
      if globals().get('detector') is None:
          detector = MTCNN()

      video_features = np.zeros((num_frames, num_features))
      if limit_faces == -1:
          frames_to_process = num_frames
      else:
          frames_to_process = min(num_frames, limit_faces)

      cap = cv2.VideoCapture(video_path)
      try:
          # Frames are consumed consecutively from the start of the stream, so
          # read() already advances to the next frame and no seek is needed.
          for frame_no in range(frames_to_process):
              ret, frame = cap.read()
              if not ret:
                  # Unreadable or shorter-than-expected video: keep the
                  # remaining rows at zero instead of passing None on.
                  break

              # detect faces
              face = detect_face(frame, box_scale=box_scale)
              if face is not None:
                  video_features[frame_no] = compute_features(face)
      finally:
          cap.release()

      return np.reshape(video_features, (1, num_frames * num_features))


In [None]:
# extract features of all REAL videos
# One row per REAL video: 54 frames x 67 features, flattened.
real_features_mx = np.zeros((len(real_videos_list), 54 * 67))

for i, video_path in enumerate(tqdm(real_videos_list)):
    real_features_mx[i] = detect_and_extract_features(video_path)

100%|██████████| 430/430 [1:13:56<00:00, 10.32s/it]


In [None]:
# extract features of all FAKE videos
# One row per FAKE video: 54 frames x 67 features, flattened.
fake_features_mx = np.zeros((len(fake_videos_list), 54 * 67))

for i, video_path in enumerate(tqdm(fake_videos_list)):
    fake_features_mx[i] = detect_and_extract_features(video_path)

100%|██████████| 320/320 [54:10<00:00, 10.16s/it]


In [None]:
import h5py

In [None]:
# Save features as HDF5 files
# Real videos
# Write each feature matrix to its own HDF5 file (real first, then fake).
# Each file holds a single dataset with one row per video.
for hdf5_path, dataset_name, features_mx in (
    ('real_videos_features.h5', 'real_features', real_features_mx),
    ('fake_videos_features.h5', 'fake_features', fake_features_mx),
):
    with h5py.File(hdf5_path, 'w') as hf:
        hf.create_dataset(name=dataset_name, data=features_mx)

## Train a SVM classifier

Using scikit-learn, train an SVM classifier on the features. When reading the features from the saved HDF5 files, you also need to **construct a vector of labels that has label 0 for each Deepfake feature vector and label 1 for each genuine feature vector.**

You can use linear SVM and play with different parameters of this classifier and study their impact on the results.

### Workflow

**Split the set of videos into two sets: training and testing.** 

There are different ways to do it, but a split of **80%** of the data for training and 20% for testing is a common choice.
You can use **train_test_split()** function from sklearn.model_selection.

Be careful how you split the list of videos into 80% for training and 20% for testing. 
You need to make sure that 80% of Deepfake videos are inside the training set and 80% of original videos are also inside the training set. 

Also, you need to **split the videos, not their features** (you have many feature vectors for each video); when you evaluate later, you will need to compute one prediction score per test video, which means all features from that video must be inside the test set.
You must always evaluate your trained classification model on the features that you did not use for training.

In a loop through all original and deepfake videos (use Python’s Glob to loop through folders), for each video compute features for all frames (loop through frames with OpenCV) in the video and save the features in HDF5 files. 
Each HDF5 file should correspond to one video and should contain an N x M feature matrix, where N is the number of frames in that video and M is the number of features you computed for one frame, so each row is the feature vector for one frame of the video.

Once all features are computed, focus on the training set of videos. Loop through the stored HDF5 files of the training set (using the same glob library), read them, and **combine all the feature vectors into one NumPy array whose rows are the feature vectors from all videos.**

At the same time, **create a separate array of integer labels**, with label 0 for each feature vector that comes from a Deepfake frame and label 1 for each feature vector that comes from an original frame.

In the end, **you should have two arrays:** 
* 1) array of features extracted from all frames of all videos and 
* 2) array of labels of the same length, where you store which feature is from fake video and which is from the original video.

**Train SVM classifier of scikit-learn on the features and labels from the training set.** This trained classifier will be used in the next milestone.


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Labels: 1.0 for every real video, 0.0 for every fake video.
y_real = np.full(len(real_videos_list), 1.0)
y_fake = np.full(len(fake_videos_list), 0.0)

# Real labels first, then fake labels.
y = np.concatenate([y_real, y_fake])

In [None]:
y.shape

(750,)

In [None]:
rvf = 'real_videos_features.h5'
fvf = 'fake_videos_features.h5'

In [None]:
# Open the real-video feature file read-only and list its dataset names.
# The handle stays open here; it is closed after the data has been read.
hf = h5py.File(rvf, 'r')
hf.keys()

<KeysViewHDF5 ['real_features']>

In [None]:
# Copy the dataset into memory before closing the file; the in-memory array
# stays usable afterwards.
X_real = np.array(hf.get('real_features'))
hf.close()

In [None]:
X_real.shape

(430, 3618)

In [None]:
# Read the fake-video features into memory. The context manager closes the
# file afterwards; previously this handle was never closed.
with h5py.File(fvf, 'r') as hf:
    X_fake = np.array(hf.get('fake_features'))

# Real rows first, then fake rows - the same order as the label vector y.
X = np.concatenate((X_real, X_fake), axis = 0)

X.shape

(750, 3618)

In [None]:
# Hold out 20% of the videos for testing. stratify=y keeps the real/fake ratio
# the same in both subsets, so 80% of the real videos and 80% of the Deepfake
# videos end up in the training set. Without it the class balance of each
# subset depends on the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y)

print('After splitting:\nNo. of samples in the training set: {}\nNo. of samples in the test set: {}'.format(X_train.shape[0], X_test.shape[0]))

After splitting:
No. of samples in the training set: 600
No. of samples in the test set: 150


Standardize numeric data

In [None]:
 # Standardize numeric data
 # Instantiate the scaler
scaler = StandardScaler()

# Learn the per-column statistics from the training rows only, then apply the
# same transformation to both subsets so no test information leaks into it.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn import svm

In [None]:
# Linear-kernel SVM with scikit-learn's remaining defaults, trained on the
# standardized training rows (one row per video).
model = svm.SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
y_test

array([0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0.])

In [None]:
# Predict a label (1 = real, 0 = fake) for every held-out video, using the
# test rows scaled with the training-set statistics.
y_pred = model.predict(X_test_scaled)
y_pred

array([0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0.])

In [None]:
# Fraction of held-out videos whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.98
