This notebook archives the code used to process the full data set and save the encoded faces in `data_processed/`.

In [1]:
%%capture
!pip install mmcv

In [2]:
import os
import torch
import glob
import time
import numpy as np
import pandas as pd
import mmcv, cv2
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image, ImageDraw, ImageFont
from IPython import display
from tqdm import tqdm
from matplotlib import pyplot as plt

In [3]:
# Display the installed Pillow version (Image is the PIL module imported above),
# recorded so that the environment used for this run can be reproduced.
Image.__version__

'6.2.1'

In [4]:
# Use the first CUDA GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'Running on device: {device}')

Running on device: cuda:0


In [5]:
# Face detector, switched to inference mode.
# NOTE(review): keep_all=True is presumably what makes the detector return every
# face found in a frame -- the processing cells below read the face count from
# the first dimension of each per-frame result. Confirm against facenet_pytorch.
mtcnn = MTCNN(
    margin=14,
    keep_all=True,
    post_process=False,
    thresholds=[0.9, 0.9, 0.9],
    device=device,
)
mtcnn = mtcnn.eval()

# Pretrained face-embedding network, switched to inference mode. It is passed to
# process_faces() in the processing cells to encode the detected faces.
resnet = InceptionResnetV1(pretrained='vggface2', device=device)
resnet = resnet.eval()

Downloading parameters (1/2)
Downloading parameters (2/2)


In [6]:
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""

    def __init__(self, detector, n_frames=None, batch_size=60, resize=None):
        """Constructor for DetectionPipeline class.

        Arguments:
            detector {callable} -- Face detector. It is called with a list of PIL images
                (one batch of frames) and its return value is appended to the results
                with ``list.extend``.

        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {60})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize

    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces.

        Arguments:
            filename {str} -- Path to video.

        Returns:
            list -- Concatenation of the detector's outputs over all batches of
                sampled frames, in frame order. Frames that could not be decoded
                are skipped.
        """
        # Create video reader; the finally clause guarantees it is released even
        # if decoding or detection raises.
        v_cap = cv2.VideoCapture(filename)
        try:
            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Pick 'n_frames' evenly spaced frames to sample
            if self.n_frames is None:
                sample = np.arange(0, v_len)
            else:
                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
            # A set gives constant-time membership tests; scanning the numpy array
            # for every frame index would be linear in the sample size.
            wanted = set(sample.tolist())

            # Loop through frames
            faces = []
            frames = []
            for j in range(v_len):
                grabbed = v_cap.grab()
                if not grabbed or j not in wanted:
                    continue

                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)

                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) >= self.batch_size:
                    faces.extend(self.detector(frames))
                    frames = []

            # Flush the final partial batch. Doing this after the loop (instead of
            # keying on the last sampled index) keeps the pending frames even when
            # the last sampled frame fails to decode.
            if frames:
                faces.extend(self.detector(frames))
        finally:
            v_cap.release()

        return faces


def process_faces(faces, resnet):
    """Encode the detected faces of one video with a pretrained embedding model.

    Arguments:
        faces {list} -- Per-frame detection results; entries that are None
            (frames without a detection) are ignored, the rest are tensors
            that get concatenated along their first dimension.
        resnet {callable} -- Model applied to the concatenated face tensor.

    Returns:
        Whatever `resnet` returns for the concatenated faces (the embeddings).
    """
    # Keep only the frames in which at least one face was found
    detected = list(filter(lambda frame_faces: frame_faces is not None, faces))

    # Stack every face into one batch on the module-level `device`
    batch = torch.cat(detected)
    batch = batch.to(device)

    # Generate facial feature vectors using the pretrained model
    return resnet(batch)

# Sample Videos for speed test
    
* Processing 400 videos took about 18 minutes on a pair of P100s. Of the 400 videos, 292 have exactly 1 face in every frame, 16 have 2 faces in every frame, and none have 3 faces in every frame. The same dataset took about 13 minutes on a pair of V100s; given the price difference, I will use P100s to process the entire data set. 
    
* An estimate of ETA is 2000 videos/folder * 50 folders * 18 minute/400videos = 4500 minutes or 75 hours. 
    
* However, I noticed that MTCNN only uses one GPU. Therefore I duplicated this notebook so that both GPUs run at the same time. Ideally this should halve the time, so we are looking at about a day and a half. 

In [10]:
# Define face detection pipeline
N_FRAMES = 30  # frames sampled per video; also the length every face-count list must have
SAMPLE_DIR = 'data/train_sample_videos'
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=N_FRAMES)

# Get all test videos
filenames = glob.glob(SAMPLE_DIR + '/*.mp4')
# After the transpose each row is one video, indexed by its file name.
metadata = pd.read_json(SAMPLE_DIR + '/metadata.json').T

# For videos with exactly k faces in every sampled frame:
#   Xk = raw detector output, Xk_encoded = embeddings, Yk = labels (1 = REAL, 0 = otherwise)
X1, X1_encoded, Y1 = [], [], []
X2, X2_encoded, Y2 = [], [], []
X3, X3_encoded, Y3 = [], [], []
outputs_by_face_count = {
    1: (X1, X1_encoded, Y1),
    2: (X2, X2_encoded, Y2),
    3: (X3, X3_encoded, Y3),
}

start = time.time()
n_processed = 0
with torch.no_grad():
    for filename in tqdm(filenames, total=len(filenames)):
        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)
            # Look the label up by file name (the metadata index) rather than by position.
            y = int(metadata.loc[os.path.basename(filename), 'label'] == 'REAL')
            # Number of faces found in each sampled frame (frames without a detection are dropped)
            face_counts = [x.shape[0] for x in faces if x is not None]
            for n_faces, (raw_list, encoded_list, label_list) in outputs_by_face_count.items():
                # Keep the video only if every sampled frame has exactly n_faces faces
                if face_counts == [n_faces] * N_FRAMES:
                    raw_list.append(faces)
                    # Calculate embeddings
                    encoded_list.append(process_faces(faces, resnet))
                    label_list.append(y)
                    break
        except KeyboardInterrupt:
            print('\nStopped.')
            break

        except Exception as e:
            print(e)
            # Skip the throughput update: `faces` would be stale (or undefined) for a failed video.
            continue

        n_processed += len(faces)
        print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')

  0%|          | 1/400 [00:02<19:20,  2.91s/it]

Frames per second (load+detect+embed):   10.3

  0%|          | 2/400 [00:05<18:58,  2.86s/it]

Frames per second (load+detect+embed):   10.6

  1%|          | 3/400 [00:08<18:45,  2.83s/it]

Frames per second (load+detect+embed):   10.7

  1%|          | 4/400 [00:10<17:08,  2.60s/it]

Frames per second (load+detect+embed):   11.4

  1%|▏         | 5/400 [00:13<17:07,  2.60s/it]

Frames per second (load+detect+embed):   11.5

  2%|▏         | 6/400 [00:15<17:00,  2.59s/it]

Frames per second (load+detect+embed):   11.5

  2%|▏         | 7/400 [00:18<18:20,  2.80s/it]

Frames per second (load+detect+embed):   11.1

  2%|▏         | 8/400 [00:21<18:06,  2.77s/it]

Frames per second (load+detect+embed):   11.1

  2%|▏         | 9/400 [00:25<19:16,  2.96s/it]

Frames per second (load+detect+embed):   10.8

  2%|▎         | 10/400 [00:27<19:04,  2.93s/it]

Frames per second (load+detect+embed):   10.7

  3%|▎         | 11/400 [00:29<17:15,  2.66s/it]

Frames per second (load+detect+embed):   11.0

  3%|▎         | 12/400 [00:33<18:00,  2.78s/it]

Frames per second (load+detect+embed):   10.9

  3%|▎         | 13/400 [00:35<17:37,  2.73s/it]

Frames per second (load+detect+embed):   10.9

  4%|▎         | 14/400 [00:38<18:39,  2.90s/it]

Frames per second (load+detect+embed):   10.8

  4%|▍         | 15/400 [00:41<17:57,  2.80s/it]

Frames per second (load+detect+embed):   10.8

  4%|▍         | 16/400 [00:44<17:53,  2.80s/it]

Frames per second (load+detect+embed):   10.8

  4%|▍         | 17/400 [00:46<17:36,  2.76s/it]

Frames per second (load+detect+embed):   10.9

  4%|▍         | 18/400 [00:49<17:43,  2.78s/it]

Frames per second (load+detect+embed):   10.8

  5%|▍         | 19/400 [00:52<17:53,  2.82s/it]

Frames per second (load+detect+embed):   10.8

  5%|▌         | 20/400 [00:55<17:28,  2.76s/it]

Frames per second (load+detect+embed):   10.8

  5%|▌         | 21/400 [00:58<17:49,  2.82s/it]

Frames per second (load+detect+embed):   10.8

  6%|▌         | 22/400 [01:01<18:22,  2.92s/it]

Frames per second (load+detect+embed):   10.7

  6%|▌         | 23/400 [01:04<18:08,  2.89s/it]

Frames per second (load+detect+embed):   10.7

  6%|▌         | 24/400 [01:07<18:14,  2.91s/it]

Frames per second (load+detect+embed):   10.7

  6%|▋         | 25/400 [01:10<18:57,  3.03s/it]

Frames per second (load+detect+embed):   10.6

  6%|▋         | 26/400 [01:13<18:05,  2.90s/it]

Frames per second (load+detect+embed):   10.7

  7%|▋         | 27/400 [01:15<17:24,  2.80s/it]

Frames per second (load+detect+embed):   10.7

  7%|▋         | 28/400 [01:18<18:14,  2.94s/it]

Frames per second (load+detect+embed):   10.6

  7%|▋         | 29/400 [01:21<17:17,  2.80s/it]

Frames per second (load+detect+embed):   10.7

  8%|▊         | 30/400 [01:24<17:12,  2.79s/it]

Frames per second (load+detect+embed):   10.7

  8%|▊         | 31/400 [01:26<17:07,  2.78s/it]

Frames per second (load+detect+embed):   10.7
Stopped.


# This GPU processes odd folders

In [None]:
# Define face detection pipeline
N_FRAMES = 30  # frames sampled per video; also the length every face-count list must have
OUTPUT_DIR = 'data_processed'
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=N_FRAMES)

# Create the output directory up front so torch.save cannot fail on a missing
# directory after a whole folder has already been processed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

interrupted = False
with torch.no_grad():
    # Odd-numbered parts only: 1, 3, ..., 49
    for f in tqdm(np.arange(1, 50, 2), total = 25):
        # Get all videos
        folder = 'data/dfdc_train_part_' + str(f)
        filenames = glob.glob(folder + '/*.mp4')
        # After the transpose each row is one video, indexed by its file name.
        metadata = pd.read_json(folder + '/metadata.json').T
        print(folder + '/*.mp4 | '+ str(len(filenames)) + ' files')

        # For videos with exactly k faces in every sampled frame:
        #   Xk_encoded = embeddings, Yk = labels (1 = REAL, 0 = otherwise).
        # The raw-face lists X1..X3 are kept for compatibility but are not filled here.
        X1, X1_encoded, Y1 = [], [], []
        X2, X2_encoded, Y2 = [], [], []
        X3, X3_encoded, Y3 = [], [], []
        outputs_by_face_count = {
            1: (X1_encoded, Y1),
            2: (X2_encoded, Y2),
            3: (X3_encoded, Y3),
        }

        start = time.time()
        n_processed = 0
        for filename in filenames:
            try:
                # Load frames and find faces
                faces = detection_pipeline(filename)
                # Look the label up by file name. glob returns files in arbitrary order,
                # so indexing the metadata by loop position pairs videos with the wrong labels.
                y = int(metadata.loc[os.path.basename(filename), 'label'] == 'REAL')
                # Number of faces found in each sampled frame (frames without a detection are dropped)
                face_counts = [x.shape[0] for x in faces if x is not None]
                for n_faces, (encoded_list, label_list) in outputs_by_face_count.items():
                    # Keep the video only if every sampled frame has exactly n_faces faces
                    if face_counts == [n_faces] * N_FRAMES:
                        # Calculate embeddings
                        encoded_list.append(process_faces(faces, resnet))
                        label_list.append(y)
                        break
            except KeyboardInterrupt:
                print('\nStopped.')
                interrupted = True
                break

            except Exception as e:
                print(e)
                # Skip the throughput update: `faces` would be stale (or undefined) for a failed video.
                continue

            n_processed += len(faces)
            print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')

        if interrupted:
            # Do not write a truncated part that could later be mistaken for a complete
            # one, and do not carry on with the remaining folders. The partial results
            # stay available in X*_encoded / Y* for manual inspection.
            print('Part ' + str(f) + ' was interrupted and has not been saved.')
            break

        for n_faces, (encoded_list, label_list) in outputs_by_face_count.items():
            torch.save(encoded_list, OUTPUT_DIR + '/' + str(n_faces) + 'face_X_part' + str(f) + '.pt')
            torch.save(label_list, OUTPUT_DIR + '/' + str(n_faces) + 'face_Y_part' + str(f) + '.pt')

  0%|          | 0/25 [00:00<?, ?it/s]

data/dfdc_train_part_1/*.mp4 | 1699 files
