This notebook archives the code to process the full data set and save the encoded faces in `/data_processed/`

In [1]:
%%capture
!pip install mmcv

In [2]:
import os
import torch
import glob
import time
import numpy as np
import pandas as pd
import mmcv, cv2
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image, ImageDraw, ImageFont, ImageEnhance
from IPython import display
from tqdm import tqdm
from matplotlib import pyplot as plt

In [3]:
Image.__version__

'6.2.1'

In [4]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:1


In [5]:
# Load face detector
mtcnn = MTCNN(margin=14, keep_all=True, post_process=False, thresholds = [0.9, 0.9, 0.9], device=device).eval()

# Load facial recognition model, but I didn't want to use it yet
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

Downloading parameters (1/2)
Downloading parameters (2/2)


In [6]:
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=60, resize=None):
        """Constructor for DetectionPipeline class.
        
        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
    
    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces.

        Arguments:
            filename {str} -- Path to video.
        """
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Pick 'n_frames' evenly spaced frames to sample
        if self.n_frames is None:
            sample = np.arange(1, v_len)
        else:
            sample = np.linspace(1, v_len - 1, self.n_frames).astype(int)

        # Loop through frames
        faces = []
        frames = []
        for j in range(v_len):
            success = v_cap.grab()
            if j in sample:
                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                max_p = frame.max()
                frame = Image.fromarray(frame)
                if max_p < 150:
                    enhancer = ImageEnhance.Brightness(frame)
                    frame = enhancer.enhance(255/max_p)
                    
                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) % self.batch_size == 0 or j == sample[-1]:
                    faces.extend(self.detector(frames))
                    frames = []

        v_cap.release()

        return faces    


def process_faces(faces, resnet):
    # Filter out frames without faces
    faces = [f for f in faces if f is not None]
    faces = torch.cat(faces).to(device)

    # Generate facial feature vectors using a pretrained model
    embeddings = resnet(faces)

    # Calculate centroid for video and distance of each face's feature vector from centroid
#     centroid = embeddings.mean(dim=0)
#     x = (embeddings - centroid).norm(dim=1).cpu().numpy()
    
    return embeddings

# Sample Videos for speed test
    
* I see 400 videos took minutes 18 minutes on a pair of P100s. There are 292/400 videos with 1 face only, 16 with 2 faces and none with three in every frame. The same dataset took about 13 minutes on a pair of V100s, given the prices differences I will use P100s to process the entire thing. 
    
* An estimate of ETA is 2000 videos/folder * 50 folders * 18 minute/400videos = 4500 minutes or 75 hours. 
    
* However, I noticed that MTCNN only uses one GPU. Therefore I duplicated this workbook so that we have both GPUs running at the same time. Ideally it should half the time and we are looking at 1 day and half. 

In [10]:
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=45)

# Get all test videos
filenames = glob.glob('data/train_sample_videos/*.mp4')
metadata = pd.read_json('data/train_sample_videos/metadata.json').T

X1 = []
X1_encoded = []
Y1 = []
X2 = []
X2_encoded = []
Y2 = []
X3 = []
X3_encoded = []
Y3 = []
start = time.time()
n_processed = 0
with torch.no_grad():
    for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)
            y = int((metadata.label['data/train_sample_videos/' + metadata.index == filename] == 'REAL') * 1)
            n_faces = [x.shape[0] if x is not None else 0 for x in faces ]
            faces = [x for x in faces if x is not None]
            if n_faces.count(3) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 3]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X3.append(f_faces)
                X3_encoded.append(process_faces(f_faces, resnet))
                Y3.append(y)
            elif n_faces.count(2) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 2]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X2.append(f_faces)
                X2_encoded.append(process_faces(f_faces, resnet))
                Y2.append(y)
            elif n_faces.count(1) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 1]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X1.append(f_faces)
                X1_encoded.append(process_faces(f_faces, resnet))
                Y1.append(y)
            #             # 1 faces ----------
#             if [x.shape[0] for x in faces if x is not None] == [1] * 30:
#                 # Calculate embeddings
#                 X1.append(faces)
#                 X1_encoded.append(process_faces(faces, resnet))
#                 Y1.append(y)
#             # 2 faces ----------   
#             if [x.shape[0] for x in faces if x is not None] == [2] * 30:
#                 # Calculate embeddings
#                 X2.append(faces)
#                 X2_encoded.append(process_faces(faces, resnet))
#                 Y2.append(y)
#             # 3 faces ----------   
#             if [x.shape[0] for x in faces if x is not None] == [3] * 30:
#                 # Calculate embeddings
#                 X3.append(faces)
#                 X3_encoded.append(process_faces(faces, resnet))
#                 Y3.append(y)
        except KeyboardInterrupt:
            print('\nStopped.')
            break

        except Exception as e:
            print(e)
        
        n_processed += len(faces)

print(time.time() - start)


  0%|          | 0/400 [00:00<?, ?it/s][A
  0%|          | 1/400 [00:02<19:38,  2.95s/it][A
  0%|          | 2/400 [00:06<19:50,  2.99s/it][A
  1%|          | 3/400 [00:09<19:50,  3.00s/it][A
  1%|          | 4/400 [00:12<21:20,  3.23s/it][A
  1%|▏         | 5/400 [00:15<20:19,  3.09s/it][A
  2%|▏         | 6/400 [00:18<19:50,  3.02s/it][A
  2%|▏         | 7/400 [00:22<21:03,  3.22s/it][A
  2%|▏         | 8/400 [00:24<20:07,  3.08s/it][A
  2%|▏         | 9/400 [00:28<21:44,  3.34s/it][A
  2%|▎         | 10/400 [00:31<21:07,  3.25s/it][A
  3%|▎         | 11/400 [00:35<21:58,  3.39s/it][A
  3%|▎         | 12/400 [00:38<21:33,  3.33s/it][A
  3%|▎         | 13/400 [00:41<20:51,  3.23s/it][A
  4%|▎         | 14/400 [00:45<22:20,  3.47s/it][A
  4%|▍         | 15/400 [00:49<21:53,  3.41s/it][A
  4%|▍         | 16/400 [00:52<21:20,  3.33s/it][A
  4%|▍         | 17/400 [00:55<20:21,  3.19s/it][A
  4%|▍         | 18/400 [00:58<20:26,  3.21s/it][A
  5%|▍         | 19/400 [01:0

 78%|███████▊  | 310/400 [16:33<04:33,  3.04s/it][A
 78%|███████▊  | 311/400 [16:36<04:28,  3.02s/it][A
 78%|███████▊  | 312/400 [16:39<04:29,  3.06s/it][A
 78%|███████▊  | 313/400 [16:43<04:39,  3.22s/it][A
 78%|███████▊  | 314/400 [16:46<04:29,  3.13s/it][A
 79%|███████▉  | 315/400 [16:49<04:35,  3.24s/it][A
 79%|███████▉  | 316/400 [16:52<04:28,  3.19s/it][A
 79%|███████▉  | 317/400 [16:55<04:14,  3.07s/it][A
 80%|███████▉  | 318/400 [16:58<04:10,  3.06s/it][A
 80%|███████▉  | 319/400 [17:01<04:07,  3.06s/it][A
 80%|████████  | 320/400 [17:04<03:59,  3.00s/it][A
 80%|████████  | 321/400 [17:07<04:04,  3.10s/it][A
 80%|████████  | 322/400 [17:11<04:07,  3.17s/it][A
 81%|████████  | 323/400 [17:14<03:58,  3.10s/it][A
 81%|████████  | 324/400 [17:17<03:54,  3.09s/it][A
 81%|████████▏ | 325/400 [17:20<03:59,  3.19s/it][A
 82%|████████▏ | 326/400 [17:23<03:52,  3.14s/it][A
 82%|████████▏ | 327/400 [17:26<03:39,  3.01s/it][A
 82%|████████▏ | 328/400 [17:29<03:34,  2.98s/

1271.5324783325195


In [11]:
len(X1_encoded)

348

In [12]:
len(X2_encoded)

38

In [None]:
torch.save(X1_encoded, 'data_processed/sample_train_faces_encoded_1face.pt')
torch.save(Y1, 'data_processed/sample_train_faces_encoded_1face.pt')

# This GPU processes even folders

In [7]:
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=45)
start = time.time()
n_processed = 0
with torch.no_grad():
    for f in tqdm(np.arange(0, 49, 2), total = 25):
        # Get all videos
        filenames = glob.glob('data/dfdc_train_part_' + str(f) + '/*.mp4')
        metadata = pd.read_json('data/dfdc_train_part_' + str(f) + '/metadata.json').T
        print('data/dfdc_train_part_' + str(f) + '/*.mp4 | '+ str(len(filenames)) + ' files')
        X1 = []
        X1_encoded = []
        Y1 = []
        X2 = []
        X2_encoded = []
        Y2 = []
        X3 = []
        X3_encoded = []
        Y3 = []
        start = time.time()
        n_processed = 0
        for i, filename in enumerate(filenames):
            try:
                # Load frames and find faces
                faces = detection_pipeline(filename)
                y = int((metadata.label['data/dfdc_train_part_' + str(f) + '/' + metadata.index == filename] == 'REAL') * 1)
                n_faces = [x.shape[0] if x is not None else 0 for x in faces ]
                faces = [x for x in faces if x is not None]
                if n_faces.count(3) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 3]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X3.append(f_faces)
                    X3_encoded.append(process_faces(f_faces, resnet))
                    Y3.append(y)
                elif n_faces.count(2) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 2]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X2.append(f_faces)
                    X2_encoded.append(process_faces(f_faces, resnet))
                    Y2.append(y)
                elif n_faces.count(1) >= 30:
                    f_faces = [x for x in faces if x.shape[0] == 1]
                    f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                    X1.append(f_faces)
                    X1_encoded.append(process_faces(f_faces, resnet))
                    Y1.append(y)
            except KeyboardInterrupt:
                print('\nStopped.')
                break

            except Exception as e:
                print(e)

        n_processed += len(faces)
        print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')
        torch.save(X1_encoded, 'data_processed/1face_X_part' + str(f) + '.pt')
        torch.save(Y1, 'data_processed/1face_Y_part' + str(f) + '.pt')
        torch.save(X2_encoded, 'data_processed/2face_X_part' + str(f) + '.pt')
        torch.save(Y2, 'data_processed/2face_Y_part' + str(f) + '.pt')
        torch.save(X3_encoded, 'data_processed/3face_X_part' + str(f) + '.pt')
        torch.save(Y3, 'data_processed/3face_Y_part' + str(f) + '.pt')

  0%|          | 0/25 [00:00<?, ?it/s]

data/dfdc_train_part_0/*.mp4 | 1334 files
Frames per second (load+detect+embed): 0.00974

  4%|▍         | 1/25 [1:17:09<30:51:36, 4629.02s/it]

data/dfdc_train_part_2/*.mp4 | 1748 files
Frames per second (load+detect+embed): 0.00766

  8%|▊         | 2/25 [2:55:11<31:58:33, 5004.94s/it]

data/dfdc_train_part_4/*.mp4 | 1701 files
Frames per second (load+detect+embed): 0.00689

 12%|█▏        | 3/25 [4:44:14<33:24:22, 5466.48s/it]

data/dfdc_train_part_6/*.mp4 | 3464 files
Frames per second (load+detect+embed): 0.00553

 16%|█▌        | 4/25 [7:00:08<36:35:28, 6272.79s/it]

data/dfdc_train_part_8/*.mp4 | 1816 files
Frames per second (load+detect+embed): 0.00579

 20%|██        | 5/25 [9:09:56<37:22:27, 6727.37s/it]

data/dfdc_train_part_10/*.mp4 | 3192 files
Frames per second (load+detect+embed): 0.00584

 24%|██▍       | 6/25 [11:19:13<37:08:05, 7036.07s/it]

data/dfdc_train_part_12/*.mp4 | 2225 files
Frames per second (load+detect+embed): 0.00609

 28%|██▊       | 7/25 [13:23:01<35:46:10, 7153.91s/it]

data/dfdc_train_part_14/*.mp4 | 2464 files
Frames per second (load+detect+embed): 0.00574

 32%|███▏      | 8/25 [15:34:24<34:48:50, 7372.36s/it]

data/dfdc_train_part_16/*.mp4 | 2061 files
Frames per second (load+detect+embed): 0.00702

 36%|███▌      | 9/25 [17:21:57<31:32:27, 7096.73s/it]

data/dfdc_train_part_18/*.mp4 | 2683 files
Frames per second (load+detect+embed): 0.00525

 40%|████      | 10/25 [19:45:29<31:27:49, 7551.31s/it]

data/dfdc_train_part_20/*.mp4 | 2154 files
Frames per second (load+detect+embed): 0.00628

 44%|████▍     | 11/25 [21:45:34<28:57:41, 7447.25s/it]

data/dfdc_train_part_22/*.mp4 | 2409 files
Frames per second (load+detect+embed): 0.00576

 48%|████▊     | 12/25 [23:56:21<27:19:33, 7567.16s/it]

data/dfdc_train_part_24/*.mp4 | 2786 files
Frames per second (load+detect+embed): 0.00509

 52%|█████▏    | 13/25 [26:24:19<26:32:07, 7960.60s/it]

data/dfdc_train_part_26/*.mp4 | 2433 files
Frames per second (load+detect+embed): 0.00611

 56%|█████▌    | 14/25 [28:27:22<23:47:41, 7787.41s/it]

data/dfdc_train_part_28/*.mp4 | 2085 files
Frames per second (load+detect+embed): 0.00673

 60%|██████    | 15/25 [30:19:00<20:43:24, 7460.40s/it]

data/dfdc_train_part_30/*.mp4 | 2236 files
Frames per second (load+detect+embed): 0.00637

 64%|██████▍   | 16/25 [32:16:55<18:21:42, 7344.68s/it]

data/dfdc_train_part_32/*.mp4 | 2356 files
Frames per second (load+detect+embed): 0.00634

 68%|██████▊   | 17/25 [34:15:26<16:09:58, 7274.79s/it]

data/dfdc_train_part_34/*.mp4 | 2658 files
could not broadcast input array from shape (0,5) into shape (0)
could not broadcast input array from shape (0,5) into shape (0)
could not broadcast input array from shape (0,5) into shape (0)
could not broadcast input array from shape (0,5) into shape (0)
could not broadcast input array from shape (0,5) into shape (0)
could not broadcast input array from shape (0,5) into shape (0)
Frames per second (load+detect+embed): 0.00543

 72%|███████▏  | 18/25 [36:33:53<14:44:49, 7584.24s/it]

data/dfdc_train_part_36/*.mp4 | 2339 files
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 10.78 GiB already allocated; 3.92 GiB free; 10.79 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 10.78 GiB already allocated; 433.19 MiB free; 14.29 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.79 GiB already allocated; 4.59 GiB free; 10.12 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.79 GiB already allocated; 4.59 GiB free; 10.12 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.79 GiB already allocated; 4.59 GiB free; 10.12 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.79 GiB already allocated; 4.59 GiB free; 10.12 GiB

CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.87 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.87 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.87 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.87 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.87 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memory. Tried to allocate 5.00 GiB (GPU 1; 15.78 GiB total capacity; 5.88 GiB already allocated; 4.51 GiB free; 10.21 GiB reserved in total by PyTorch)
CUDA out of memo

 76%|███████▌  | 19/25 [38:37:08<12:32:45, 7527.52s/it]

data/dfdc_train_part_38/*.mp4 | 2477 files
Frames per second (load+detect+embed): 0.00555

 80%|████████  | 20/25 [40:52:30<10:42:09, 7705.96s/it]

data/dfdc_train_part_40/*.mp4 | 2420 files
Frames per second (load+detect+embed): 0.00591

 84%|████████▍ | 21/25 [42:59:36<8:32:07, 7681.87s/it] 

data/dfdc_train_part_42/*.mp4 | 2384 files
Frames per second (load+detect+embed): 0.00584

 88%|████████▊ | 22/25 [45:08:08<6:24:33, 7691.02s/it]

data/dfdc_train_part_44/*.mp4 | 2665 files
Frames per second (load+detect+embed): 0.00559

 92%|█████████▏| 23/25 [47:22:32<4:20:05, 7802.95s/it]

data/dfdc_train_part_46/*.mp4 | 2202 files
Frames per second (load+detect+embed): 0.00641

 96%|█████████▌| 24/25 [49:19:49<2:06:13, 7573.20s/it]

data/dfdc_train_part_48/*.mp4 | 2463 files
Frames per second (load+detect+embed): 0.00582

100%|██████████| 25/25 [51:28:47<00:00, 7622.42s/it]  


In [15]:
len(X1_encoded)

1267