This notebook archives the code to process the full data set and save the encoded faces in `/data_processed/`

In [2]:
%%capture
!pip install mmcv

In [1]:
import os
import torch
import glob
import time
import numpy as np
import pandas as pd
import mmcv, cv2
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image, ImageDraw, ImageFont, ImageEnhance
from IPython import display
from tqdm import tqdm
from matplotlib import pyplot as plt

In [2]:
Image.__version__

'6.2.1'

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:0


In [4]:
# Load face detector
mtcnn = MTCNN(margin=14, keep_all=True, post_process=False, thresholds = [0.9, 0.9, 0.9], device=device).eval()

# Load facial recognition model, but I didn't want to use it yet
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

In [5]:
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=60, resize=None):
        """Constructor for DetectionPipeline class.
        
        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
    
    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces.

        Arguments:
            filename {str} -- Path to video.
        """
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Pick 'n_frames' evenly spaced frames to sample
        if self.n_frames is None:
            sample = np.arange(1, v_len)
        else:
            sample = np.linspace(1, v_len - 1, self.n_frames).astype(int)

        # Loop through frames
        faces = []
        frames = []
        for j in range(v_len):
            success = v_cap.grab()
            if j in sample:
                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                max_p = frame.max()
                frame = Image.fromarray(frame)
                if max_p < 150:
                    enhancer = ImageEnhance.Brightness(frame)
                    frame = enhancer.enhance(255/max_p)
                    
                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) % self.batch_size == 0 or j == sample[-1]:
                    faces.extend(self.detector(frames))
                    frames = []

        v_cap.release()

        return faces    


def process_faces(faces, resnet):
    # Filter out frames without faces
    faces = [f for f in faces if f is not None]
    faces = torch.cat(faces).to(device)

    # Generate facial feature vectors using a pretrained model
    embeddings = resnet(faces)

    # Calculate centroid for video and distance of each face's feature vector from centroid
#     centroid = embeddings.mean(dim=0)
#     x = (embeddings - centroid).norm(dim=1).cpu().numpy()
    
    return embeddings

# Sample Videos for speed test
    
* I see 400 videos took minutes 18 minutes on a pair of P100s. There are 348/400 videos with 1 face only, 38 with 2 faces and none with three in every frame. The same dataset took about 13 minutes on a pair of V100s, given the prices differences I will use P100s to process the entire thing. 
    
* An estimate of ETA is 2000 videos/folder * 50 folders * 18 minute/400videos = 4500 minutes or 75 hours. 
    
* However, I noticed that MTCNN only uses one GPU. Therefore I duplicated this workbook so that we have both GPUs running at the same time. Ideally it should half the time and we are looking at 1 day and half. 

In [230]:
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=45)

# Get all test videos
filenames = glob.glob('data/train_sample_videos/*.mp4')
metadata = pd.read_json('data/train_sample_videos/metadata.json').T

X1 = []
X1_encoded = []
Y1 = []
X2 = []
X2_encoded = []
Y2 = []
X3 = []
X3_encoded = []
Y3 = []
start = time.time()
n_processed = 0
with torch.no_grad():
    for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)
            y = int((metadata.label['data/train_sample_videos/' + metadata.index == filename] == 'REAL') * 1)
            n_faces = [x.shape[0] if x is not None else 0 for x in faces ]
            faces = [x for x in faces if x is not None]
            if n_faces.count(3) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 3]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X3.append(f_faces)
                X3_encoded.append(process_faces(f_faces, resnet))
                Y3.append(y)
            elif n_faces.count(2) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 2]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X2.append(f_faces)
                X2_encoded.append(process_faces(f_faces, resnet))
                Y2.append(y)
            elif n_faces.count(1) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 1]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X1.append(f_faces)
                X1_encoded.append(process_faces(f_faces, resnet))
                Y1.append(y)
            #             # 1 faces ----------
#             if [x.shape[0] for x in faces if x is not None] == [1] * 30:
#                 # Calculate embeddings
#                 X1.append(faces)
#                 X1_encoded.append(process_faces(faces, resnet))
#                 Y1.append(y)
#             # 2 faces ----------   
#             if [x.shape[0] for x in faces if x is not None] == [2] * 30:
#                 # Calculate embeddings
#                 X2.append(faces)
#                 X2_encoded.append(process_faces(faces, resnet))
#                 Y2.append(y)
#             # 3 faces ----------   
#             if [x.shape[0] for x in faces if x is not None] == [3] * 30:
#                 # Calculate embeddings
#                 X3.append(faces)
#                 X3_encoded.append(process_faces(faces, resnet))
#                 Y3.append(y)
        except KeyboardInterrupt:
            print('\nStopped.')
            break

        except Exception as e:
            print(e)
        
        n_processed += len(faces)

print(time.time() - start)











  0%|          | 0/400 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A









  0%|          | 1/400 [00:02<16:59,  2.55s/it][A[A[A[A[A[A[A[A[A[A









  0%|          | 2/400 [00:05<17:32,  2.64s/it][A[A[A[A[A[A[A[A[A[A









  1%|          | 3/400 [00:08<18:13,  2.76s/it][A[A[A[A[A[A[A[A[A[A









  1%|          | 4/400 [00:11<19:39,  2.98s/it][A[A[A[A[A[A[A[A[A[A









  1%|▏         | 5/400 [00:14<19:15,  2.92s/it][A[A[A[A[A[A[A[A[A[A









  2%|▏         | 6/400 [00:17<18:31,  2.82s/it][A[A[A[A[A[A[A[A[A[A









  2%|▏         | 7/400 [00:20<19:44,  3.01s/it][A[A[A[A[A[A[A[A[A[A









  2%|▏         | 8/400 [00:23<18:54,  2.89s/it][A[A[A[A[A[A[A[A[A[A









  2%|▏         | 9/400 [00:26<20:11,  3.10s/it][A[A[A[A[A[A[A[A[A[A









  2%|▎         | 10/400 [00:29<19:48,  3.05s/it][A[A[A[A[A[A[A[A[A[A









  3%|▎         | 11/400 [00:33<21:26,  3

 46%|████▋     | 185/400 [09:37<11:06,  3.10s/it][A[A[A[A[A[A[A[A[A[A









 46%|████▋     | 186/400 [09:40<10:59,  3.08s/it][A[A[A[A[A[A[A[A[A[A









 47%|████▋     | 187/400 [09:43<11:11,  3.15s/it][A[A[A[A[A[A[A[A[A[A









 47%|████▋     | 188/400 [09:47<11:28,  3.25s/it][A[A[A[A[A[A[A[A[A[A









 47%|████▋     | 189/400 [09:50<11:57,  3.40s/it][A[A[A[A[A[A[A[A[A[A









 48%|████▊     | 190/400 [09:53<11:29,  3.28s/it][A[A[A[A[A[A[A[A[A[A









 48%|████▊     | 191/400 [09:57<11:33,  3.32s/it][A[A[A[A[A[A[A[A[A[A









 48%|████▊     | 192/400 [10:00<11:40,  3.37s/it][A[A[A[A[A[A[A[A[A[A









 48%|████▊     | 193/400 [10:04<11:25,  3.31s/it][A[A[A[A[A[A[A[A[A[A









 48%|████▊     | 194/400 [10:06<10:51,  3.16s/it][A[A[A[A[A[A[A[A[A[A









 49%|████▉     | 195/400 [10:09<10:30,  3.07s/it][A[A[A[A[A[A[A[A[A[A









 49%|████▉     | 196/

 92%|█████████▏| 369/400 [19:17<01:35,  3.07s/it][A[A[A[A[A[A[A[A[A[A









 92%|█████████▎| 370/400 [19:20<01:29,  2.97s/it][A[A[A[A[A[A[A[A[A[A









 93%|█████████▎| 371/400 [19:23<01:28,  3.04s/it][A[A[A[A[A[A[A[A[A[A









 93%|█████████▎| 372/400 [19:27<01:27,  3.11s/it][A[A[A[A[A[A[A[A[A[A









 93%|█████████▎| 373/400 [19:30<01:22,  3.07s/it][A[A[A[A[A[A[A[A[A[A









 94%|█████████▎| 374/400 [19:33<01:19,  3.06s/it][A[A[A[A[A[A[A[A[A[A









 94%|█████████▍| 375/400 [19:36<01:18,  3.13s/it][A[A[A[A[A[A[A[A[A[A









 94%|█████████▍| 376/400 [19:39<01:14,  3.08s/it][A[A[A[A[A[A[A[A[A[A









 94%|█████████▍| 377/400 [19:42<01:11,  3.13s/it][A[A[A[A[A[A[A[A[A[A









 94%|█████████▍| 378/400 [19:45<01:09,  3.17s/it][A[A[A[A[A[A[A[A[A[A









 95%|█████████▍| 379/400 [19:49<01:08,  3.28s/it][A[A[A[A[A[A[A[A[A[A









 95%|█████████▌| 380/

1255.3883385658264


In [234]:
len(X1_encoded)

348

In [235]:
len(X2_encoded)

38

In [236]:
len(X3_encoded)

0

# This GPU processes odd folders

In [8]:
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=45)
start = time.time()
n_processed = 0
with torch.no_grad():
    for f in tqdm(np.arange(13, 50, 2), total = len(np.arange(13, 50, 2))):
        # Get all videos
        filenames = glob.glob('data/dfdc_train_part_' + str(f) + '/*.mp4')
        metadata = pd.read_json('data/dfdc_train_part_' + str(f) + '/metadata.json').T
        print('data/dfdc_train_part_' + str(f) + '/*.mp4 | '+ str(len(filenames)) + ' files')
        X1 = []
        X1_encoded = []
        Y1 = []
        X2 = []
        X2_encoded = []
        Y2 = []
        X3 = []
        X3_encoded = []
        Y3 = []
        start = time.time()
        n_processed = 0
        for i, filename in enumerate(filenames):
            try:
                # Load frames and find faces
                faces = detection_pipeline(filename)
                y = int((metadata.label['data/dfdc_train_part_' + str(f) + '/' + metadata.index == filename] == 'REAL') * 1)
                n_faces = [x.shape[0] if x is not None else 0 for x in faces ]
                faces = [x for x in faces if x is not None]
                if n_faces.count(3) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 3]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X3.append(f_faces)
                    X3_encoded.append(process_faces(f_faces, resnet))
                    Y3.append(y)
                elif n_faces.count(2) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 2]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X2.append(f_faces)
                    X2_encoded.append(process_faces(f_faces, resnet))
                    Y2.append(y)
                elif n_faces.count(1) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 1]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X1.append(f_faces)
                    X1_encoded.append(process_faces(f_faces, resnet))
                    Y1.append(y)
            except KeyboardInterrupt:
                print('\nStopped.')
                break

            except Exception as e:
                print(e)

        n_processed += len(faces)
        print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')
        torch.save(X1_encoded, 'data_processed/1face_X_part' + str(f) + '.pt')
        torch.save(Y1, 'data_processed/1face_Y_part' + str(f) + '.pt')
        torch.save(X2_encoded, 'data_processed/2face_X_part' + str(f) + '.pt')
        torch.save(Y2, 'data_processed/2face_Y_part' + str(f) + '.pt')
        torch.save(X3_encoded, 'data_processed/3face_X_part' + str(f) + '.pt')
        torch.save(Y3, 'data_processed/3face_Y_part' + str(f) + '.pt')


  0%|          | 0/19 [00:00<?, ?it/s][A

data/dfdc_train_part_13/*.mp4 | 3694 files
Frames per second (load+detect+embed): 0.00456


  5%|▌         | 1/19 [2:44:31<49:21:20, 9871.15s/it][A

data/dfdc_train_part_15/*.mp4 | 2273 files
Frames per second (load+detect+embed): 0.0065


 11%|█         | 2/19 [4:40:07<42:27:22, 8990.71s/it][A

data/dfdc_train_part_17/*.mp4 | 2430 files
Frames per second (load+detect+embed): 0.00616


 16%|█▌        | 3/19 [6:42:34<37:46:02, 8497.63s/it][A

data/dfdc_train_part_19/*.mp4 | 2752 files
Frames per second (load+detect+embed): 0.00593


 21%|██        | 4/19 [8:49:41<34:19:06, 8236.42s/it][A

data/dfdc_train_part_21/*.mp4 | 2268 files
Frames per second (load+detect+embed): 0.00481


 26%|██▋       | 5/19 [10:48:15<30:43:13, 7899.55s/it][A

data/dfdc_train_part_23/*.mp4 | 2410 files
Frames per second (load+detect+embed): 0.00595


 32%|███▏      | 6/19 [12:54:59<28:12:21, 7810.88s/it][A

data/dfdc_train_part_25/*.mp4 | 2546 files
Frames per second (load+detect+embed): 0.00599


 37%|███▋      | 7/19 [15:00:23<25:44:57, 7724.82s/it][A

data/dfdc_train_part_27/*.mp4 | 2353 files
Frames per second (load+detect+embed): 0.00596


 42%|████▏     | 8/19 [17:06:26<23:27:19, 7676.35s/it][A

data/dfdc_train_part_29/*.mp4 | 2557 files
Frames per second (load+detect+embed): 0.0058


 47%|████▋     | 9/19 [19:16:01<21:24:19, 7705.94s/it][A

data/dfdc_train_part_31/*.mp4 | 2470 files
Frames per second (load+detect+embed): 0.00572


 53%|█████▎    | 10/19 [21:27:20<19:23:41, 7757.90s/it][A

data/dfdc_train_part_33/*.mp4 | 2274 files
Frames per second (load+detect+embed): 0.00644


 58%|█████▊    | 11/19 [23:24:06<16:44:18, 7532.31s/it][A

data/dfdc_train_part_35/*.mp4 | 2535 files
Frames per second (load+detect+embed): 0.00577


 63%|██████▎   | 12/19 [25:34:20<14:48:37, 7616.82s/it][A

data/dfdc_train_part_37/*.mp4 | 2655 files
Frames per second (load+detect+embed): 0.00586


 68%|██████▊   | 13/19 [27:42:39<12:44:09, 7641.64s/it][A

data/dfdc_train_part_39/*.mp4 | 2556 files
Frames per second (load+detect+embed): 0.00578


 74%|███████▎  | 14/19 [29:52:41<10:40:47, 7689.48s/it][A

data/dfdc_train_part_41/*.mp4 | 2222 files
Frames per second (load+detect+embed): 0.00534


 79%|███████▉  | 15/19 [32:13:19<8:47:36, 7914.16s/it] [A

data/dfdc_train_part_43/*.mp4 | 2546 files
Frames per second (load+detect+embed): 0.0059


 84%|████████▍ | 16/19 [34:20:36<6:31:33, 7831.03s/it][A

data/dfdc_train_part_45/*.mp4 | 2346 files
Frames per second (load+detect+embed): 0.00592


 89%|████████▉ | 17/19 [36:27:26<4:18:49, 7764.77s/it][A

data/dfdc_train_part_47/*.mp4 | 2406 files
Frames per second (load+detect+embed): 0.00547


 95%|█████████▍| 18/19 [38:32:32<2:08:07, 7687.14s/it][A

data/dfdc_train_part_49/*.mp4 | 3134 files
Frames per second (load+detect+embed): 0.00578


100%|██████████| 19/19 [40:42:34<00:00, 7721.53s/it]  [A